diff --git a/README.md b/README.md index efa7440e..86041ed5 100644 --- a/README.md +++ b/README.md @@ -8,19 +8,19 @@ ### Information Advanced RVC Inference presents itself as a state-of-the-art web UI crafted to streamline rapid and effortless inference. This comprehensive toolset encompasses a model downloader, a voice splitter, and the added efficiency of batch inference. -Please support the original RVC. This inference won't be possible to make without it.
-[![Original RVC Repository](https://img.shields.io/badge/Github-Original%20RVC%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) +Please support Applio. This inference would not be possible without it.
+[![Original Applio](https://img.shields.io/badge/Github-Original%20Applio%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/IAHispano/Applio) #### Features - Support V1 & V2 Model ✅ - Youtube Audio Downloader ✅ - Audio-Separator (Voice Splitter) [Internet required for downloading model] ✅ - Model Downloader ✅ +- TTS Support #### Currently Working - Settings 🛠 - Microphone Support -- TTS Support - Gradio WebUI ### Installation diff --git a/assets/hubert/.gitkeep b/assets/hubert/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/assets/rmvpe/.gitkeep b/assets/rmvpe/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/audio_input/.gitkeep b/audio_input/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/download_audio.py b/download_audio.py new file mode 100644 index 00000000..baffc000 --- /dev/null +++ b/download_audio.py @@ -0,0 +1,68 @@ +import os +import argparse +import yt_dlp + + +class MyLogger(object): + def debug(self, msg): + print("[DEBUG]", msg) + + def warning(self, msg): + print("[WARNING]", msg) + + def error(self, msg): + print("[ERROR]", msg) + + +def progress_hook(info): + status = info.get("status") + if status == "downloading": + downloaded = info.get("downloaded_bytes", 0) + total = info.get("total_bytes", info.get("total_bytes_estimate", 0)) + if total: + percent = downloaded / total * 100 + print(f"[DEBUG] Downloading: {percent:.2f}%") + elif status == "finished": + print("[DEBUG] Download finished, now converting to WAV...") + + +def download_youtube_audio(url, output_path): + os.makedirs(output_path, exist_ok=True) + + outtmpl = os.path.join(output_path, "%(title)s.%(ext)s") + + ydl_opts = { + "format": "bestaudio/best", + "outtmpl": outtmpl, + "logger": MyLogger(), + "progress_hooks": [progress_hook], + "postprocessors": [ + { + "key": "FFmpegExtractAudio", + "preferredcodec": "wav", + "preferredquality": "192", + } + ], + "verbose": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + + +# Command-line interface for local usage. +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download a YouTube video's audio as WAV using yt-dlp with debugging output." + ) + parser.add_argument("url", help="The URL of the YouTube video to download.") + parser.add_argument( + "--output", + default="downloads", + help="Custom output directory (default: 'downloads').", + ) + args = parser.parse_args() + download_youtube_audio(args.url, args.output) + + +# gyatt dyum made by NeoDev diff --git a/install.bat b/install.bat new file mode 100644 index 00000000..ccec91bb --- /dev/null +++ b/install.bat @@ -0,0 +1,87 @@ +@echo off +setlocal enabledelayedexpansion +title RVC CLI Installer + +echo Welcome to the RVC CLI Installer! +echo. + +set "INSTALL_DIR=%cd%" +set "MINICONDA_DIR=%UserProfile%\Miniconda3" +set "ENV_DIR=%INSTALL_DIR%\env" +set "MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Windows-x86_64.exe" +set "CONDA_EXE=%MINICONDA_DIR%\Scripts\conda.exe" + +call :cleanup +call :install_miniconda +call :create_conda_env +call :install_dependencies + +echo RVC CLI has been installed successfully! +echo. +pause +exit /b 0 + +:cleanup +echo Cleaning up unnecessary files... +for %%F in (Makefile Dockerfile docker-compose.yaml *.sh) do if exist "%%F" del "%%F" +echo Cleanup complete. +echo. +exit /b 0 + +:install_miniconda +if exist "%CONDA_EXE%" ( + echo Miniconda already installed. Skipping installation. 
+ exit /b 0 +) + +echo Miniconda not found. Starting download and installation... +powershell -Command "& {Invoke-WebRequest -Uri '%MINICONDA_URL%' -OutFile 'miniconda.exe'}" +if not exist "miniconda.exe" goto :download_error + +start /wait "" miniconda.exe /InstallationType=JustMe /RegisterPython=0 /S /D=%MINICONDA_DIR% +if errorlevel 1 goto :install_error + +del miniconda.exe +echo Miniconda installation complete. +echo. +exit /b 0 + +:create_conda_env +echo Creating Conda environment... +call "%MINICONDA_DIR%\_conda.exe" create --no-shortcuts -y -k --prefix "%ENV_DIR%" python=3.9 +if errorlevel 1 goto :error +echo Conda environment created successfully. +echo. + +if exist "%ENV_DIR%\python.exe" ( + echo Installing specific pip version... + "%ENV_DIR%\python.exe" -m pip install "pip<24.1" + if errorlevel 1 goto :error + echo Pip installation complete. + echo. +) +exit /b 0 + +:install_dependencies +echo Installing dependencies... +call "%MINICONDA_DIR%\condabin\conda.bat" activate "%ENV_DIR%" || goto :error +pip install --upgrade setuptools || goto :error +pip install --no-cache-dir -r "%INSTALL_DIR%\requirements.txt" || goto :error +pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cu121 || goto :error +call "%MINICONDA_DIR%\condabin\conda.bat" deactivate +echo Dependencies installation complete. +echo. +exit /b 0 + +:download_error +echo Download failed. Please check your internet connection and try again. +goto :error + +:install_error +echo Miniconda installation failed. +goto :error + +:error +echo An error occurred during installation. Please check the output above for details. +pause +exit /b 1 diff --git a/lib/infer.py b/lib/infer.py deleted file mode 100644 index 1f8e0fb2..00000000 --- a/lib/infer.py +++ /dev/null @@ -1,221 +0,0 @@ -import os -import shutil -import gc -import torch -from multiprocessing import cpu_count -from lib.modules import VC -from lib.split_audio import split_silence_nonsilent, adjust_audio_lengths, combine_silence_nonsilent - -class Configs: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available(): - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - #if ( -# ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) -# or "P40" in self.gpu_name.upper() -# or "1060" in self.gpu_name -# or "1070" in self.gpu_name -# or "1080" in self.gpu_name -# ): -# print("16 series/10 series P40 forced single precision") -# self.is_half = False -# for config_file in ["32k.json", "40k.json", "48k.json"]: -# with open(BASE_DIR / "src" / "configs" / config_file, "r") as f: -# strr = f.read().replace("true", "false") -# with open(BASE_DIR / "src" / "configs" / config_file, "w") as f: -# f.write(strr) -# with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f: -# strr = f.read().replace("3.7", "3.0") -# with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f: -# f.write(strr) -# else: -# self.gpu_name = None -# self.gpu_mem = int( -# torch.cuda.get_device_properties(i_device).total_memory -# / 1024 -# / 1024 -# / 1024 -# + 0.4 -# ) -# if self.gpu_mem <= 4: -# with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f: -# strr = f.read().replace("3.7", 
"3.0") -# with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f: -# f.write(strr) - elif torch.backends.mps.is_available(): - print("No supported N-card found, use MPS for inference") - self.device = "mps" - else: - print("No supported N-card found, use CPU for inference") - self.device = "cpu" - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G memory config - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 - else: - # 5G memory config - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return x_pad, x_query, x_center, x_max - -def get_model(voice_model): - model_dir = os.path.join(os.getcwd(), "models", voice_model) - model_filename, index_filename = None, None - for file in os.listdir(model_dir): - ext = os.path.splitext(file)[1] - if ext == '.pth': - model_filename = file - if ext == '.index': - index_filename = file - - if model_filename is None: - print(f'No model file exists in {models_dir}.') - return None, None - - return os.path.join(model_dir, model_filename), os.path.join(model_dir, index_filename) if index_filename else '' - -def infer_audio( - model_name, - audio_path, - f0_change=0, - f0_method="rmvpe+", - min_pitch="50", - max_pitch="1100", - crepe_hop_length=128, - index_rate=0.75, - filter_radius=3, - rms_mix_rate=0.25, - protect=0.33, - split_infer=False, - min_silence=500, - silence_threshold=-50, - seek_step=1, - keep_silence=100, - do_formant=False, - quefrency=0, - timbre=1, - f0_autotune=False, - audio_format="wav", - resample_sr=0, - hubert_model_path="assets/hubert/hubert_base.pt", - rmvpe_model_path="assets/rmvpe/rmvpe.pt", - fcpe_model_path="assets/fcpe/fcpe.pt" - ): - os.environ["rmvpe_model_path"] = rmvpe_model_path - os.environ["fcpe_model_path"] = fcpe_model_path - configs = Configs('cuda:0', True) - vc = VC(configs) - pth_path, index_path = get_model(model_name) - vc_data = vc.get_vc(pth_path, protect, 0.5) - - if split_infer: - inferred_files = [] - temp_dir = os.path.join(os.getcwd(), "seperate", "temp") - os.makedirs(temp_dir, exist_ok=True) - print("Splitting audio to silence and nonsilent segments.") - silence_files, nonsilent_files = split_silence_nonsilent(audio_path, min_silence, silence_threshold, seek_step, keep_silence) - print(f"Total silence segments: {len(silence_files)}.\nTotal nonsilent segments: {len(nonsilent_files)}.") - for i, nonsilent_file in enumerate(nonsilent_files): - print(f"Inferring nonsilent audio {i+1}") - inference_info, audio_data, output_path = vc.vc_single( - 0, - nonsilent_file, - f0_change, - f0_method, - index_path, - index_path, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - audio_format, - crepe_hop_length, - do_formant, - quefrency, - timbre, - min_pitch, - max_pitch, - f0_autotune, - hubert_model_path - ) - if inference_info[0] == "Success.": - print("Inference ran successfully.") - print(inference_info[1]) - print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],)) - else: - print(f"An error occurred while processing.\n{inference_info[0]}") - return None - inferred_files.append(output_path) - print("Adjusting inferred audio lengths.") - adjusted_inferred_files = adjust_audio_lengths(nonsilent_files, inferred_files) - print("Combining silence and inferred audios.") - output_count = 1 - while True: - output_path = os.path.join(os.getcwd(), "output", 
f"{os.path.splitext(os.path.basename(audio_path))[0]}{model_name}{f0_method.capitalize()}_{output_count}.{audio_format}") - if not os.path.exists(output_path): - break - output_count += 1 - output_path = combine_silence_nonsilent(silence_files, adjusted_inferred_files, keep_silence, output_path) - [shutil.move(inferred_file, temp_dir) for inferred_file in inferred_files] - shutil.rmtree(temp_dir) - else: - inference_info, audio_data, output_path = vc.vc_single( - 0, - audio_path, - f0_change, - f0_method, - index_path, - index_path, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - audio_format, - crepe_hop_length, - do_formant, - quefrency, - timbre, - min_pitch, - max_pitch, - f0_autotune, - hubert_model_path - ) - if inference_info[0] == "Success.": - print("Inference ran successfully.") - print(inference_info[1]) - print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],)) - else: - print(f"An error occurred while processing.\n{inference_info[0]}") - del configs, vc - gc.collect() - return inference_info[0] - - del configs, vc - gc.collect() - return output_path \ No newline at end of file diff --git a/lib/infer_libs/audio.py b/lib/infer_libs/audio.py deleted file mode 100644 index 5c831766..00000000 --- a/lib/infer_libs/audio.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -import av -import ffmpeg -import os -import traceback -import sys -import subprocess - -platform_stft_mapping = { - 'linux': os.path.join(os.getcwd(), 'stftpitchshift'), - 'darwin': os.path.join(os.getcwd(), 'stftpitchshift'), - 'win32': os.path.join(os.getcwd(), 'stftpitchshift.exe'), -} - -stft = platform_stft_mapping.get(sys.platform) - -def wav2(i, o, format): - inp = av.open(i, 'rb') - if format == "m4a": format = "mp4" - out = av.open(o, 'wb', format=format) - if format == "ogg": format = "libvorbis" - if format == "mp4": format = "aac" - - ostream = out.add_stream(format) - - for frame in inp.decode(audio=0): - for p in ostream.encode(frame): out.mux(p) - - for p in ostream.encode(None): out.mux(p) - - out.close() - inp.close() - -def load_audio(file, sr, DoFormant=False, Quefrency=1.0, Timbre=1.0): - formanted = False - file = file.strip(' \n"') - if not os.path.exists(file): - raise RuntimeError( - "Wrong audio path, that does not exist." - ) - - try: - if DoFormant: - print("Starting formant shift. Please wait as this process takes a while.") - formanted_file = f"{os.path.splitext(os.path.basename(file))[0]}_formanted{os.path.splitext(os.path.basename(file))[1]}" - command = ( - f'{stft} -i "{file}" -q "{Quefrency}" ' - f'-t "{Timbre}" -o "{formanted_file}"' - ) - subprocess.run(command, shell=True) - file = formanted_file - print(f"Formanted {file}\n") - - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
- file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # Prevent small white copy path head and tail with spaces and " and return - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - - return np.frombuffer(out, np.float32).flatten() - - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - -def check_audio_duration(file): - try: - file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - - probe = ffmpeg.probe(file) - - duration = float(probe['streams'][0]['duration']) - - if duration < 0.76: - print( - f"Audio file, {file.split('/')[-1]}, under ~0.76s detected - file is too short. Target at least 1-2s for best results." - ) - return False - - return True - except Exception as e: - raise RuntimeError(f"Failed to check audio duration: {e}") \ No newline at end of file diff --git a/lib/infer_libs/fcpe.py b/lib/infer_libs/fcpe.py deleted file mode 100644 index ddffd33e..00000000 --- a/lib/infer_libs/fcpe.py +++ /dev/null @@ -1,873 +0,0 @@ -from typing import Union - -import torch.nn.functional as F -import numpy as np -import torch -import torch.nn as nn -from torch.nn.utils import weight_norm -from torchaudio.transforms import Resample -import os -import librosa -import soundfile as sf -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn -import math -from functools import partial - -from einops import rearrange, repeat -from local_attention import LocalAttention -from torch import nn - -os.environ["LRU_CACHE_CAPACITY"] = "3" - -def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sampling_rate = None - try: - data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. - except Exception as ex: - print(f"'{full_path}' failed to load.\nException:") - print(ex) - if return_empty_on_exception: - return [], sampling_rate or target_sr or 48000 - else: - raise Exception(ex) - - if len(data.shape) > 1: - data = data[:, 0] - assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) - - if np.issubdtype(data.dtype, np.integer): # if audio data is type int - max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX - else: # if audio data is type fp32 - max_mag = max(np.amax(data), -np.amin(data)) - max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 - - data = torch.FloatTensor(data.astype(np.float32))/max_mag - - if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. 
return_empty_on_exception will return empty arr instead of except - return [], sampling_rate or target_sr or 48000 - if target_sr is not None and sampling_rate != target_sr: - data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) - sampling_rate = target_sr - - return data, sampling_rate - -def dynamic_range_compression(x, C=1, clip_val=1e-5): - return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) - -def dynamic_range_decompression(x, C=1): - return np.exp(x) / C - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - -def dynamic_range_decompression_torch(x, C=1): - return torch.exp(x) / C - -class STFT(): - def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): - self.target_sr = sr - - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_length = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - self.mel_basis = {} - self.hann_window = {} - - def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - sampling_rate = self.target_sr - n_mels = self.n_mels - n_fft = self.n_fft - win_size = self.win_size - hop_length = self.hop_length - fmin = self.fmin - fmax = self.fmax - clip_val = self.clip_val - - factor = 2 ** (keyshift / 12) - n_fft_new = int(np.round(n_fft * factor)) - win_size_new = int(np.round(win_size * factor)) - hop_length_new = int(np.round(hop_length * speed)) - if not train: - mel_basis = self.mel_basis - hann_window = self.hann_window - else: - mel_basis = {} - hann_window = {} - - if torch.min(y) < -1.: - print('min value is ', torch.min(y)) - if torch.max(y) > 1.: - print('max value is ', torch.max(y)) - - mel_basis_key = str(fmax)+'_'+str(y.device) - if mel_basis_key not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) - mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) - - keyshift_key = str(keyshift)+'_'+str(y.device) - if keyshift_key not in hann_window: - hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) - - pad_left = (win_size_new - hop_length_new) //2 - pad_right = max((win_size_new- hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left) - if pad_right < y.size(-1): - mode = 'reflect' - else: - mode = 'constant' - y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode) - y = y.squeeze(1) - - spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) - spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) - if keyshift != 0: - size = n_fft // 2 + 1 - resize = spec.size(1) - if resize < size: - spec = F.pad(spec, (0, 0, 0, size-resize)) - spec = spec[:, :size, :] * win_size / win_size_new - spec = torch.matmul(mel_basis[mel_basis_key], spec) - spec = dynamic_range_compression_torch(spec, clip_val=clip_val) - return spec - - def __call__(self, audiopath): - audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) - spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) - return spect - -stft = STFT() - -#import fast_transformers.causal_product.causal_product_cuda - -def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device = None): - b, h, *_ = data.shape - # (batch size, head, length, 
model_dim) - - # normalize model dim - data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1. - - # what is ration?, projection_matrix.shape[0] --> 266 - - ratio = (projection_matrix.shape[0] ** -0.5) - - projection = repeat(projection_matrix, 'j d -> b h j d', b = b, h = h) - projection = projection.type_as(data) - - #data_dash = w^T x - data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection) - - - # diag_data = D**2 - diag_data = data ** 2 - diag_data = torch.sum(diag_data, dim=-1) - diag_data = (diag_data / 2.0) * (data_normalizer ** 2) - diag_data = diag_data.unsqueeze(dim=-1) - - #print () - if is_query: - data_dash = ratio * ( - torch.exp(data_dash - diag_data - - torch.max(data_dash, dim=-1, keepdim=True).values) + eps) - else: - data_dash = ratio * ( - torch.exp(data_dash - diag_data + eps))#- torch.max(data_dash)) + eps) - - return data_dash.type_as(data) - -def orthogonal_matrix_chunk(cols, qr_uniform_q = False, device = None): - unstructured_block = torch.randn((cols, cols), device = device) - q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced') - q, r = map(lambda t: t.to(device), (q, r)) - - # proposed by @Parskatt - # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf - if qr_uniform_q: - d = torch.diag(r, 0) - q *= d.sign() - return q.t() -def exists(val): - return val is not None - -def empty(tensor): - return tensor.numel() == 0 - -def default(val, d): - return val if exists(val) else d - -def cast_tuple(val): - return (val,) if not isinstance(val, tuple) else val - -class PCmer(nn.Module): - """The encoder that is used in the Transformer model.""" - - def __init__(self, - num_layers, - num_heads, - dim_model, - dim_keys, - dim_values, - residual_dropout, - attention_dropout): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.dim_values = dim_values - self.dim_keys = dim_keys - self.residual_dropout = residual_dropout - self.attention_dropout = attention_dropout - - self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) - - # METHODS ######################################################################################################## - - def forward(self, phone, mask=None): - - # apply all layers to the input - for (i, layer) in enumerate(self._layers): - phone = layer(phone, mask) - # provide the final sequence - return phone - - -# ==================================================================================================================== # -# CLASS _ E N C O D E R L A Y E R # -# ==================================================================================================================== # - - -class _EncoderLayer(nn.Module): - """One layer of the encoder. - - Attributes: - attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence. - feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism. - """ - - def __init__(self, parent: PCmer): - """Creates a new instance of ``_EncoderLayer``. - - Args: - parent (Encoder): The encoder that the layers is created for. - """ - super().__init__() - - - self.conformer = ConformerConvModule(parent.dim_model) - self.norm = nn.LayerNorm(parent.dim_model) - self.dropout = nn.Dropout(parent.residual_dropout) - - # selfatt -> fastatt: performer! 
- self.attn = SelfAttention(dim = parent.dim_model, - heads = parent.num_heads, - causal = False) - - # METHODS ######################################################################################################## - - def forward(self, phone, mask=None): - - # compute attention sub-layer - phone = phone + (self.attn(self.norm(phone), mask=mask)) - - phone = phone + (self.conformer(phone)) - - return phone - -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - -# helper classes - -class Swish(nn.Module): - def forward(self, x): - return x * x.sigmoid() - -class Transpose(nn.Module): - def __init__(self, dims): - super().__init__() - assert len(dims) == 2, 'dims must be a tuple of two dimensions' - self.dims = dims - - def forward(self, x): - return x.transpose(*self.dims) - -class GLU(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - out, gate = x.chunk(2, dim=self.dim) - return out * gate.sigmoid() - -class DepthWiseConv1d(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding): - super().__init__() - self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in) - - def forward(self, x): - x = F.pad(x, self.padding) - return self.conv(x) - -class ConformerConvModule(nn.Module): - def __init__( - self, - dim, - causal = False, - expansion_factor = 2, - kernel_size = 31, - dropout = 0.): - super().__init__() - - inner_dim = dim * expansion_factor - padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) - - self.net = nn.Sequential( - nn.LayerNorm(dim), - Transpose((1, 2)), - nn.Conv1d(dim, inner_dim * 2, 1), - GLU(dim=1), - DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding), - #nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), - Swish(), - nn.Conv1d(inner_dim, dim, 1), - Transpose((1, 2)), - nn.Dropout(dropout) - ) - - def forward(self, x): - return self.net(x) - -def linear_attention(q, k, v): - if v is None: - #print (k.size(), q.size()) - out = torch.einsum('...ed,...nd->...ne', k, q) - return out - - else: - k_cumsum = k.sum(dim = -2) - #k_cumsum = k.sum(dim = -2) - D_inv = 1. / (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8) - - context = torch.einsum('...nd,...ne->...de', k, v) - #print ("TRUEEE: ", context.size(), q.size(), D_inv.size()) - out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv) - return out - -def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False, device = None): - nb_full_blocks = int(nb_rows / nb_columns) - #print (nb_full_blocks) - block_list = [] - - for _ in range(nb_full_blocks): - q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) - block_list.append(q) - # block_list[n] is a orthogonal matrix ... 
(model_dim * model_dim) - #print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1))) - #print (nb_rows, nb_full_blocks, nb_columns) - remaining_rows = nb_rows - nb_full_blocks * nb_columns - #print (remaining_rows) - if remaining_rows > 0: - q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device) - #print (q[:remaining_rows].size()) - block_list.append(q[:remaining_rows]) - - final_matrix = torch.cat(block_list) - - if scaling == 0: - multiplier = torch.randn((nb_rows, nb_columns), device = device).norm(dim = 1) - elif scaling == 1: - multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device = device) - else: - raise ValueError(f'Invalid scaling {scaling}') - - return torch.diag(multiplier) @ final_matrix - -class FastAttention(nn.Module): - def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, no_projection = False): - super().__init__() - nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) - - self.dim_heads = dim_heads - self.nb_features = nb_features - self.ortho_scaling = ortho_scaling - - self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows = self.nb_features, nb_columns = dim_heads, scaling = ortho_scaling, qr_uniform_q = qr_uniform_q) - projection_matrix = self.create_projection() - self.register_buffer('projection_matrix', projection_matrix) - - self.generalized_attention = generalized_attention - self.kernel_fn = kernel_fn - - # if this is turned on, no projection will be used - # queries and keys will be softmax-ed as in the original efficient attention paper - self.no_projection = no_projection - - self.causal = causal - - @torch.no_grad() - def redraw_projection_matrix(self): - projections = self.create_projection() - self.projection_matrix.copy_(projections) - del projections - - def forward(self, q, k, v): - device = q.device - - if self.no_projection: - q = q.softmax(dim = -1) - k = torch.exp(k) if self.causal else k.softmax(dim = -2) - else: - create_kernel = partial(softmax_kernel, projection_matrix = self.projection_matrix, device = device) - - q = create_kernel(q, is_query = True) - k = create_kernel(k, is_query = False) - - attn_fn = linear_attention if not self.causal else self.causal_linear_fn - if v is None: - out = attn_fn(q, k, None) - return out - else: - out = attn_fn(q, k, v) - return out -class SelfAttention(nn.Module): - def __init__(self, dim, causal = False, heads = 8, dim_head = 64, local_heads = 0, local_window_size = 256, nb_features = None, feature_redraw_interval = 1000, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, dropout = 0., no_projection = False): - super().__init__() - assert dim % heads == 0, 'dimension must be divisible by number of heads' - dim_head = default(dim_head, dim // heads) - inner_dim = dim_head * heads - self.fast_attention = FastAttention(dim_head, nb_features, causal = causal, generalized_attention = generalized_attention, kernel_fn = kernel_fn, qr_uniform_q = qr_uniform_q, no_projection = no_projection) - - self.heads = heads - self.global_heads = heads - local_heads - self.local_attn = LocalAttention(window_size = local_window_size, causal = causal, autopad = True, dropout = dropout, look_forward = int(not causal), rel_pos_emb_config = (dim_head, local_heads)) if local_heads > 0 else None - - #print (heads, nb_features, dim_head) - #name_embedding = 
torch.zeros(110, heads, dim_head, dim_head) - #self.name_embedding = nn.Parameter(name_embedding, requires_grad=True) - - - self.to_q = nn.Linear(dim, inner_dim) - self.to_k = nn.Linear(dim, inner_dim) - self.to_v = nn.Linear(dim, inner_dim) - self.to_out = nn.Linear(inner_dim, dim) - self.dropout = nn.Dropout(dropout) - - @torch.no_grad() - def redraw_projection_matrix(self): - self.fast_attention.redraw_projection_matrix() - #torch.nn.init.zeros_(self.name_embedding) - #print (torch.sum(self.name_embedding)) - def forward(self, x, context = None, mask = None, context_mask = None, name=None, inference=False, **kwargs): - _, _, _, h, gh = *x.shape, self.heads, self.global_heads - - cross_attend = exists(context) - - context = default(context, x) - context_mask = default(context_mask, mask) if not cross_attend else context_mask - #print (torch.sum(self.name_embedding)) - q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) - - q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) - (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) - - attn_outs = [] - #print (name) - #print (self.name_embedding[name].size()) - if not empty(q): - if exists(context_mask): - global_mask = context_mask[:, None, :, None] - v.masked_fill_(~global_mask, 0.) - if cross_attend: - pass - #print (torch.sum(self.name_embedding)) - #out = self.fast_attention(q,self.name_embedding[name],None) - #print (torch.sum(self.name_embedding[...,-1:])) - else: - out = self.fast_attention(q, k, v) - attn_outs.append(out) - - if not empty(lq): - assert not cross_attend, 'local attention is not compatible with cross attention' - out = self.local_attn(lq, lk, lv, input_mask = mask) - attn_outs.append(out) - - out = torch.cat(attn_outs, dim = 1) - out = rearrange(out, 'b h n d -> b n (h d)') - out = self.to_out(out) - return self.dropout(out) - -def l2_regularization(model, l2_alpha): - l2_loss = [] - for module in model.modules(): - if type(module) is nn.Conv2d: - l2_loss.append((module.weight ** 2).sum() / 2.0) - return l2_alpha * sum(l2_loss) - - -class FCPEModel(nn.Module): - def __init__( - self, - input_channel=128, - out_dims=360, - n_layers=12, - n_chans=512, - use_siren=False, - use_full=False, - loss_mse_scale=10, - loss_l2_regularization=False, - loss_l2_regularization_scale=1, - loss_grad1_mse=False, - loss_grad1_mse_scale=1, - f0_max=1975.5, - f0_min=32.70, - confidence=False, - threshold=0.05, - use_input_conv=True - ): - super().__init__() - if use_siren is True: - raise ValueError("Siren is not supported yet.") - if use_full is True: - raise ValueError("Full model is not supported yet.") - - self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 - self.loss_l2_regularization = loss_l2_regularization if (loss_l2_regularization is not None) else False - self.loss_l2_regularization_scale = loss_l2_regularization_scale if (loss_l2_regularization_scale - is not None) else 1 - self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False - self.loss_grad1_mse_scale = loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 - self.f0_max = f0_max if (f0_max is not None) else 1975.5 - self.f0_min = f0_min if (f0_min is not None) else 32.70 - self.confidence = confidence if (confidence is not None) else False - self.threshold = threshold if (threshold is not None) else 0.05 - self.use_input_conv = use_input_conv if (use_input_conv is not None) else True - - self.cent_table_b = torch.Tensor( - 
np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0], - out_dims)) - self.register_buffer("cent_table", self.cent_table_b) - - # conv in stack - _leaky = nn.LeakyReLU() - self.stack = nn.Sequential( - nn.Conv1d(input_channel, n_chans, 3, 1, 1), - nn.GroupNorm(4, n_chans), - _leaky, - nn.Conv1d(n_chans, n_chans, 3, 1, 1)) - - # transformer - self.decoder = PCmer( - num_layers=n_layers, - num_heads=8, - dim_model=n_chans, - dim_keys=n_chans, - dim_values=n_chans, - residual_dropout=0.1, - attention_dropout=0.1) - self.norm = nn.LayerNorm(n_chans) - - # out - self.n_out = out_dims - self.dense_out = weight_norm( - nn.Linear(n_chans, self.n_out)) - - def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder = "local_argmax"): - """ - input: - B x n_frames x n_unit - return: - dict of B x n_frames x feat - """ - if cdecoder == "argmax": - self.cdecoder = self.cents_decoder - elif cdecoder == "local_argmax": - self.cdecoder = self.cents_local_decoder - if self.use_input_conv: - x = self.stack(mel.transpose(1, 2)).transpose(1, 2) - else: - x = mel - x = self.decoder(x) - x = self.norm(x) - x = self.dense_out(x) # [B,N,D] - x = torch.sigmoid(x) - if not infer: - gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1] - gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim] - loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) # bce loss - # l2 regularization - if self.loss_l2_regularization: - loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale) - x = loss_all - if infer: - x = self.cdecoder(x) - x = self.cent_to_f0(x) - if not return_hz_f0: - x = (1 + x / 700).log() - return x - - def cents_decoder(self, y, mask=True): - B, N, _ = y.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True) # cents: [B,N,1] - if mask: - confident = torch.max(y, dim=-1, keepdim=True)[0] - confident_mask = torch.ones_like(confident) - confident_mask[confident <= self.threshold] = float("-INF") - rtn = rtn * confident_mask - if self.confidence: - return rtn, confident - else: - return rtn - - def cents_local_decoder(self, y, mask=True): - B, N, _ = y.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - confident, max_index = torch.max(y, dim=-1, keepdim=True) - local_argmax_index = torch.arange(0,9).to(max_index.device) + (max_index - 4) - local_argmax_index[local_argmax_index<0] = 0 - local_argmax_index[local_argmax_index>=self.n_out] = self.n_out - 1 - ci_l = torch.gather(ci,-1,local_argmax_index) - y_l = torch.gather(y,-1,local_argmax_index) - rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(y_l, dim=-1, keepdim=True) # cents: [B,N,1] - if mask: - confident_mask = torch.ones_like(confident) - confident_mask[confident <= self.threshold] = float("-INF") - rtn = rtn * confident_mask - if self.confidence: - return rtn, confident - else: - return rtn - - def cent_to_f0(self, cent): - return 10. * 2 ** (cent / 1200.) - - def f0_to_cent(self, f0): - return 1200. * torch.log2(f0 / 10.) - - def gaussian_blurred_cent(self, cents): # cents: [B,N,1] - mask = (cents > 0.1) & (cents < (1200. 
* np.log2(self.f0_max / 10.))) - B, N, _ = cents.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() - - -class FCPEInfer: - def __init__(self, model_path, device=None, dtype=torch.float32): - if device is None: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.device = device - ckpt = torch.load(model_path, map_location=torch.device(self.device)) - self.args = DotDict(ckpt["config"]) - self.dtype = dtype - model = FCPEModel( - input_channel=self.args.model.input_channel, - out_dims=self.args.model.out_dims, - n_layers=self.args.model.n_layers, - n_chans=self.args.model.n_chans, - use_siren=self.args.model.use_siren, - use_full=self.args.model.use_full, - loss_mse_scale=self.args.loss.loss_mse_scale, - loss_l2_regularization=self.args.loss.loss_l2_regularization, - loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, - loss_grad1_mse=self.args.loss.loss_grad1_mse, - loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, - f0_max=self.args.model.f0_max, - f0_min=self.args.model.f0_min, - confidence=self.args.model.confidence, - ) - model.to(self.device).to(self.dtype) - model.load_state_dict(ckpt['model']) - model.eval() - self.model = model - self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05): - self.model.threshold = threshold - audio = audio[None,:] - mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) - f0 = self.model(mel=mel, infer=True, return_hz_f0=True) - return f0 - - -class Wav2Mel: - - def __init__(self, args, device=None, dtype=torch.float32): - # self.args = args - self.sampling_rate = args.mel.sampling_rate - self.hop_size = args.mel.hop_size - if device is None: - device = 'cuda' if torch.cuda.is_available() else 'cpu' - self.device = device - self.dtype = dtype - self.stft = STFT( - args.mel.sampling_rate, - args.mel.num_mels, - args.mel.n_fft, - args.mel.win_size, - args.mel.hop_size, - args.mel.fmin, - args.mel.fmax - ) - self.resample_kernel = {} - - def extract_nvstft(self, audio, keyshift=0, train=False): - mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) # B, n_frames, bins - return mel - - def extract_mel(self, audio, sample_rate, keyshift=0, train=False): - audio = audio.to(self.dtype).to(self.device) - # resample - if sample_rate == self.sampling_rate: - audio_res = audio - else: - key_str = str(sample_rate) - if key_str not in self.resample_kernel: - self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128) - self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device) - audio_res = self.resample_kernel[key_str](audio) - - # extract - mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train) # B, n_frames, bins - n_frames = int(audio.shape[1] // self.hop_size) + 1 - if n_frames > int(mel.shape[1]): - mel = torch.cat((mel, mel[:, -1:, :]), 1) - if n_frames < int(mel.shape[1]): - mel = mel[:, :n_frames, :] - return mel - - def __call__(self, audio, sample_rate, keyshift=0, train=False): - return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) - - -class DotDict(dict): - def __getattr__(*args): - val = dict.get(*args) - return DotDict(val) if type(val) is dict else val - - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - -class F0Predictor(object): - def compute_f0(self,wav,p_len): - ''' - input: 
wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length] - ''' - pass - - def compute_f0_uv(self,wav,p_len): - ''' - input: wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] - ''' - pass - -class FCPE(F0Predictor): - def __init__(self, model_path, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100, - threshold=0.05): - self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - if device is None: - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - else: - self.device = device - self.threshold = threshold - self.sampling_rate = sampling_rate - self.dtype = dtype - self.name = "fcpe" - - def repeat_expand( - self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest" - ): - ndim = content.ndim - - if content.ndim == 1: - content = content[None, None] - elif content.ndim == 2: - content = content[None] - - assert content.ndim == 3 - - is_np = isinstance(content, np.ndarray) - if is_np: - content = torch.from_numpy(content) - - results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) - - if is_np: - results = results.numpy() - - if ndim == 1: - return results[0, 0] - elif ndim == 2: - return results[0] - - def post_process(self, x, sampling_rate, f0, pad_to): - if isinstance(f0, np.ndarray): - f0 = torch.from_numpy(f0).float().to(x.device) - - if pad_to is None: - return f0 - - f0 = self.repeat_expand(f0, pad_to) - - vuv_vector = torch.zeros_like(f0) - vuv_vector[f0 > 0.0] = 1.0 - vuv_vector[f0 <= 0.0] = 0.0 - - # 去掉0频率, 并线性插值 - nzindex = torch.nonzero(f0).squeeze() - f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy() - time_frame = np.arange(pad_to) * self.hop_length / sampling_rate - - vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] - - if f0.shape[0] <= 0: - return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy() - if f0.shape[0] == 1: - return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[ - 0]).cpu().numpy(), vuv_vector.cpu().numpy() - - # 大概可以用 torch 重写? 
- f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) - # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) - - return f0, vuv_vector.cpu().numpy() - - def compute_f0(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - if p_len is None: - print("fcpe p_len is None") - p_len = x.shape[0] // self.hop_length - #else: -# assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] - if torch.all(f0 == 0): - rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) - return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len)[0] - - def compute_f0_uv(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - if p_len is None: - p_len = x.shape[0] // self.hop_length - #else: -# assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0] - if torch.all(f0 == 0): - rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) - return rtn, rtn - return self.post_process(x, self.sampling_rate, f0, p_len) \ No newline at end of file diff --git a/lib/infer_libs/infer_pack/attentions.py b/lib/infer_libs/infer_pack/attentions.py deleted file mode 100644 index 94d61c89..00000000 --- a/lib/infer_libs/infer_pack/attentions.py +++ /dev/null @@ -1,414 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import functional as F - -from lib.infer_libs.infer_pack import commons -from lib.infer_libs.infer_pack.modules import LayerNorm - - -class Encoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - window_size=10, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - window_size=window_size, - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class Decoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - proximal_bias=False, - proximal_init=True, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - 
self.proximal_init = proximal_init - - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.encdec_attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - proximal_bias=proximal_bias, - proximal_init=proximal_init, - ) - ) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.encdec_attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - causal=True, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, h, h_mask): - """ - x: decoder input - h: encoder output - """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) - encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) - - y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class MultiHeadAttention(nn.Module): - def __init__( - self, - channels, - out_channels, - n_heads, - p_dropout=0.0, - window_size=None, - heads_share=True, - block_length=None, - proximal_bias=False, - proximal_init=False, - ): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, 
t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." - block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = ( - output.transpose(2, 3).contiguous().view(b, d, t_t) - ) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: [h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), - ) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) - - # Reshape and slice out the padded elements. 
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] - return x_final - - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) - x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """Bias for self-attention to encourage attention to close positions. - Args: - length: an integer scalar. - Returns: - a Tensor with shape [1, 1, length, length] - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class FFN(nn.Module): - def __init__( - self, - in_channels, - out_channels, - filter_channels, - kernel_size, - p_dropout=0.0, - activation=None, - causal=False, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding - - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x * x_mask)) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(self.padding(x * x_mask)) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x diff --git a/lib/infer_libs/infer_pack/commons.py b/lib/infer_libs/infer_pack/commons.py deleted file mode 100644 index 2618e3ad..00000000 --- a/lib/infer_libs/infer_pack/commons.py +++ /dev/null @@ -1,164 +0,0 @@ -import math -import torch -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def kl_divergence(m_p, logs_p, m_q, logs_q): - """KL(P||Q)""" - kl = (logs_q - logs_p) - 0.5 - kl += ( - 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) - ) - return kl - - -def rand_gumbel(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 - 
return -torch.log(-torch.log(uniform_samples)) - - -def rand_gumbel_like(x): - g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) - return g - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret - - -def slice_segments2(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, idx_str:idx_end] - return ret - - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): - position = torch.arange(length, dtype=torch.float) - num_timescales = channels // 2 - log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( - num_timescales - 1 - ) - inv_timescales = min_timescale * torch.exp( - torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment - ) - scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) - signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) - signal = F.pad(signal, [0, 0, 0, channels % 2]) - signal = signal.view(1, channels, length) - return signal - - -def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return x + signal.to(dtype=x.dtype, device=x.device) - - -def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) - - -def subsequent_mask(length): - mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) - return mask - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def shift_1d(x): - x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] - return x - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - -def clip_grad_value_(parameters, clip_value, 
norm_type=2): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - norm_type = float(norm_type) - if clip_value is not None: - clip_value = float(clip_value) - - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - if clip_value is not None: - p.grad.data.clamp_(min=-clip_value, max=clip_value) - total_norm = total_norm ** (1.0 / norm_type) - return total_norm diff --git a/lib/infer_libs/infer_pack/models.py b/lib/infer_libs/infer_pack/models.py deleted file mode 100644 index 06f58a90..00000000 --- a/lib/infer_libs/infer_pack/models.py +++ /dev/null @@ -1,1174 +0,0 @@ -import math -import logging - -logger = logging.getLogger(__name__) - -import numpy as np -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from lib.infer_libs.infer_pack import attentions, commons, modules -from lib.infer_libs.infer_pack.commons import get_padding, init_weights -has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) - -class TextEncoder256(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class TextEncoder768(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(768, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # 
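# commons.sequence_mask and the slice helpers deleted above are the building
# blocks for masked, randomly cropped training segments.  A small usage sketch
# (PyTorch only; the tensor sizes below are made up for illustration):
import torch


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def slice_segments(x, ids_str, segment_size=4):
    # x: [b, d, t] -> [b, d, segment_size], one crop per batch item
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        ret[i] = x[i, :, ids_str[i]: ids_str[i] + segment_size]
    return ret


if __name__ == "__main__":
    lengths = torch.tensor([6, 3, 5])
    print(sequence_mask(lengths, max_length=6).int())   # [3, 6] boolean mask

    x = torch.arange(20, dtype=torch.float32).view(2, 1, 10)
    ids = torch.tensor([0, 4])
    print(slice_segments(x, ids, segment_size=4))        # crops [0:4] and [4:8]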
[b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class ResidualCouplingBlock(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - ): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append( - modules.ResidualCouplingLayer( - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - mean_only=True, - ) - ) - self.flows.append(modules.Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - - -class PosteriorEncoder(nn.Module): - def __init__( - self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class Generator(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=0, - ): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - 
self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g=None): - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": # for DirectML - uv = uv.float() - return uv - - def forward(self, f0, upp): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand( - f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=upp, - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose( - 2, 1 - ) ####### - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 
2 * np.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threhold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sampling_rate, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to merge source harmonics into a single excitation - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upp=None): - if hasattr(self, "ddtype") == False: - self.ddtype = self.l_linear.weight.dtype - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # if self.is_half: - # sine_wavs = sine_wavs.half() - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - if sine_wavs.dtype != self.ddtype: - sine_wavs = sine_wavs.to(self.ddtype) - sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - 
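# SineGen above builds its excitation by cumulatively summing per-sample phase
# increments (f0 / sample_rate) for the fundamental and each harmonic overtone.
# A much simplified sketch of that idea, with no upsampling, noise, or
# voiced/unvoiced handling; the constants are illustrative only:
import math
import torch


def harmonic_sines(f0, sample_rate=16000, n_harmonics=2, amp=0.1):
    # f0: [batch, time] fundamental frequency in Hz (0 where unvoiced)
    harmonics = torch.arange(1, n_harmonics + 2, dtype=f0.dtype, device=f0.device)
    # per-sample phase increment for each harmonic: [batch, time, n_harmonics + 1]
    rad = (f0.unsqueeze(-1) * harmonics) / sample_rate
    phase = 2 * math.pi * torch.cumsum(rad, dim=1)
    return amp * torch.sin(phase)


if __name__ == "__main__":
    f0 = torch.full((1, 16000), 220.0)        # constant 220 Hz contour, 1 second
    waves = harmonic_sines(f0)
    print(waves.shape)                        # torch.Size([1, 16000, 3])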
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - self.upp = np.prod(upsample_rates) - - def forward(self, x, f0, g=None): - har_source, noi_source, uv = self.m_source(f0, self.upp) - har_source = har_source.transpose(1, 2) - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -sr2sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - - -class SynthesizerTrnMs256NSFsid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward( - self, phone, 
phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - nsff0 = nsff0[:, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 
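# In the infer() methods above the prior is sampled with a fixed temperature
# (0.66666) and, when `rate` is given, only the trailing fraction of the latent
# frames is kept before decoding.  A tiny standalone sketch of just that
# sampling-and-truncation step (tensor sizes are made up):
import torch


def sample_prior(m_p, logs_p, x_mask, temperature=0.66666, rate=None):
    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * temperature) * x_mask
    if rate is not None:
        head = int(z_p.shape[2] * rate)
        z_p = z_p[:, :, -head:]
        x_mask = x_mask[:, :, -head:]
    return z_p, x_mask


if __name__ == "__main__":
    m_p = torch.zeros(1, 192, 100)
    logs_p = torch.zeros(1, 192, 100)
    x_mask = torch.ones(1, 1, 100)
    z_p, x_mask = sample_prior(m_p, logs_p, x_mask, rate=0.5)
    print(z_p.shape, x_mask.shape)            # keeps only the last 50 frames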
1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - nsff0 = nsff0[:, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs256NSFsid_nono(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - o = 
self.dec(z_slice, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid_nono(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - logger.debug( - "gin_channels: " - + str(gin_channels) - + ", self.spk_embed_dim: " - + str(self.spk_embed_dim) - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - o = self.dec(z_slice, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, sid, rate=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - if rate: - head = int(z_p.shape[2] * rate) - z_p = z_p[:, :, -head:] - x_mask = x_mask[:, :, -head:] - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec(z * x_mask, g=g) - return o, x_mask, (z, z_p, m_p, 
logs_p) - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() - periods = [2, 3, 5, 7, 11, 17] - # periods = [3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class MultiPeriodDiscriminatorV2(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminatorV2, self).__init__() - # periods = [2, 3, 5, 7, 11, 17] - periods = [2, 3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f( - Conv2d( - 1, - 32, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 32, - 128, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 128, - 512, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 512, - 1024, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 1024, - 1024, - (kernel_size, 1), - 1, - padding=(get_padding(kernel_size, 1), 0), - ) - ), - ] - ) - 
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - if has_xpu and x.dtype == torch.bfloat16: - x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to(dtype=torch.bfloat16) - else: - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap diff --git a/lib/infer_libs/infer_pack/modules.py b/lib/infer_libs/infer_pack/modules.py deleted file mode 100644 index caae38b0..00000000 --- a/lib/infer_libs/infer_pack/modules.py +++ /dev/null @@ -1,517 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import Conv1d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm - -from lib.infer_libs.infer_pack import commons -from lib.infer_libs.infer_pack.commons import get_padding, init_weights -from lib.infer_libs.infer_pack.transforms import piecewise_rational_quadratic_transform - -LRELU_SLOPE = 0.1 - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__( - self, - in_channels, - hidden_channels, - out_channels, - kernel_size, - n_layers, - p_dropout, - ): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." 
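# DiscriminatorP above folds the 1-D waveform into a 2-D [frames, period] grid,
# reflect-padding so the length divides the period, before applying its 2-D
# convolutions.  A minimal sketch of just that folding step:
import torch
import torch.nn.functional as F


def fold_by_period(x, period):
    # x: [batch, channels, time] -> [batch, channels, time // period, period]
    b, c, t = x.shape
    if t % period != 0:
        n_pad = period - (t % period)
        x = F.pad(x, (0, n_pad), "reflect")
        t = t + n_pad
    return x.view(b, c, t // period, period)


if __name__ == "__main__":
    audio = torch.randn(1, 1, 16000)
    print(fold_by_period(audio, 5).shape)     # torch.Size([1, 1, 3200, 5])
    print(fold_by_period(audio, 7).shape)     # torch.Size([1, 1, 2286, 7])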
- - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append( - nn.Conv1d( - hidden_channels, - hidden_channels, - kernel_size, - padding=kernel_size // 2, - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d( - channels, - channels, - kernel_size, - groups=channels, - dilation=dilation, - padding=padding, - ) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__( - self, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - p_dropout=0, - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - - for i in range(n_layers): - dilation = dilation_rate**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, - 2 * hidden_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - 
n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, : self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - -class ResBlock1(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - self.convs1.apply(init_weights) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - self.convs2.apply(init_weights) - - def forward(self, x, x_mask=None): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c2(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_weight_norm(l) - for l in self.convs2: - remove_weight_norm(l) - - -class ResBlock2(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - self.convs.apply(init_weights) - - def forward(self, x, x_mask=None): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_weight_norm(l) - - -class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = 
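# The WaveNet-style WN block above mixes its dilated-conv output with the
# conditioning through a gated activation: the first half of the channels goes
# through tanh, the second half through sigmoid, and the two are multiplied
# (this is what fused_add_tanh_sigmoid_multiply computes).  A standalone sketch
# of that gate, with an illustrative channel count:
import torch


def gated_activation(x_in, g_l, n_channels):
    in_act = x_in + g_l                                   # [b, 2*n_channels, t]
    t_act = torch.tanh(in_act[:, :n_channels, :])
    s_act = torch.sigmoid(in_act[:, n_channels:, :])
    return t_act * s_act                                  # [b, n_channels, t]


if __name__ == "__main__":
    hidden = 4
    x_in = torch.randn(2, 2 * hidden, 10)                 # dilated-conv output
    g_l = torch.randn(2, 2 * hidden, 10)                  # conditioning slice
    print(gated_activation(x_in, g_l, hidden).shape)      # torch.Size([2, 4, 10])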
torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=p_dropout, - gin_channels=gin_channels, - ) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class ConvFlow(nn.Module): - def __init__( - self, - in_channels, - filter_channels, - kernel_size, - n_layers, - num_bins=10, - tail_bound=5.0, - ): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
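# The flow layers above (Log, Flip, ElementwiseAffine, ResidualCouplingLayer)
# are exactly invertible: forward returns the transformed tensor plus a
# log-determinant, reverse undoes it.  A minimal round-trip check for the
# elementwise affine case (shapes and random parameters are illustrative):
import torch


class ElementwiseAffineSketch(torch.nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.m = torch.nn.Parameter(torch.zeros(channels, 1))
        self.logs = torch.nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False):
        if not reverse:
            y = (self.m + torch.exp(self.logs) * x) * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        return (x - self.m) * torch.exp(-self.logs) * x_mask


if __name__ == "__main__":
    torch.manual_seed(0)
    flow = ElementwiseAffineSketch(channels=8)
    with torch.no_grad():
        flow.m.normal_()
        flow.logs.normal_()
    x = torch.randn(2, 8, 16)
    x_mask = torch.ones(2, 1, 16)
    y, logdet = flow(x, x_mask)
    x_back = flow(y, x_mask, reverse=True)
    print(torch.allclose(x, x_back, atol=1e-5), logdet.shape)   # True torch.Size([2])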
- - unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) - unnormalized_derivatives = h[..., 2 * self.num_bins :] - - x1, logabsdet = piecewise_rational_quadratic_transform( - x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails="linear", - tail_bound=self.tail_bound, - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1, 2]) - if not reverse: - return x, logdet - else: - return x diff --git a/lib/infer_libs/infer_pack/transforms.py b/lib/infer_libs/infer_pack/transforms.py deleted file mode 100644 index 6f30b717..00000000 --- a/lib/infer_libs/infer_pack/transforms.py +++ /dev/null @@ -1,207 +0,0 @@ -import numpy as np -import torch -from torch.nn import functional as F - -DEFAULT_MIN_BIN_WIDTH = 1e-3 -DEFAULT_MIN_BIN_HEIGHT = 1e-3 -DEFAULT_MIN_DERIVATIVE = 1e-3 - - -def piecewise_rational_quadratic_transform( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - if tails is None: - spline_fn = rational_quadratic_spline - spline_kwargs = {} - else: - spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = {"tails": tails, "tail_bound": tail_bound} - - outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs - ) - return outputs, logabsdet - - -def searchsorted(bin_locations, inputs, eps=1e-6): - bin_locations[..., -1] += eps - return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 - - -def unconstrained_rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails="linear", - tail_bound=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - logabsdet = torch.zeros_like(inputs) - - if tails == "linear": - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) - constant = np.log(np.exp(1 - min_derivative) - 1) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - logabsdet[outside_interval_mask] = 0 - else: - raise RuntimeError("{} tails are not implemented.".format(tails)) - - ( - outputs[inside_interval_mask], - logabsdet[inside_interval_mask], - ) = rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - inverse=inverse, - left=-tail_bound, - right=tail_bound, - bottom=-tail_bound, - top=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - ) - - return outputs, logabsdet - 
- -def rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0.0, - right=1.0, - bottom=0.0, - top=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError("Input to a transform is not within its domain") - - num_bins = unnormalized_widths.shape[-1] - - if min_bin_width * num_bins > 1.0: - raise ValueError("Minimal bin width too large for the number of bins") - if min_bin_height * num_bins > 1.0: - raise ValueError("Minimal bin height too large for the number of bins") - - widths = F.softmax(unnormalized_widths, dim=-1) - widths = min_bin_width + (1 - min_bin_width * num_bins) * widths - cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) - cumwidths = (right - left) * cumwidths + left - cumwidths[..., 0] = left - cumwidths[..., -1] = right - widths = cumwidths[..., 1:] - cumwidths[..., :-1] - - derivatives = min_derivative + F.softplus(unnormalized_derivatives) - - heights = F.softmax(unnormalized_heights, dim=-1) - heights = min_bin_height + (1 - min_bin_height * num_bins) * heights - cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) - cumheights = (top - bottom) * cumheights + bottom - cumheights[..., 0] = bottom - cumheights[..., -1] = top - heights = cumheights[..., 1:] - cumheights[..., :-1] - - if inverse: - bin_idx = searchsorted(cumheights, inputs)[..., None] - else: - bin_idx = searchsorted(cumwidths, inputs)[..., None] - - input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] - input_bin_widths = widths.gather(-1, bin_idx)[..., 0] - - input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] - delta = heights / widths - input_delta = delta.gather(-1, bin_idx)[..., 0] - - input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] - input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] - - input_heights = heights.gather(-1, bin_idx)[..., 0] - - if inverse: - a = (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) + input_heights * (input_delta - input_derivatives) - b = input_heights * input_derivatives - (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) - c = -input_delta * (inputs - input_cumheights) - - discriminant = b.pow(2) - 4 * a * c - assert (discriminant >= 0).all() - - root = (2 * c) / (-b - torch.sqrt(discriminant)) - outputs = root * input_bin_widths + input_cumwidths - - theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, -logabsdet - else: - theta = (inputs - input_cumwidths) / input_bin_widths - theta_one_minus_theta = theta * (1 - theta) - - numerator = input_heights * ( - input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta - ) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - outputs = 
input_cumheights + numerator / denominator - - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, logabsdet diff --git a/lib/infer_libs/rmvpe.py b/lib/infer_libs/rmvpe.py deleted file mode 100644 index d0e591aa..00000000 --- a/lib/infer_libs/rmvpe.py +++ /dev/null @@ -1,705 +0,0 @@ -import os - -import numpy as np -import torch -try: - #Fix "Torch not compiled with CUDA enabled" - import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - if torch.xpu.is_available(): - from lib.infer.modules.ipex import ipex_init - ipex_init() -except Exception: - pass -import torch.nn as nn -import torch.nn.functional as F -from librosa.util import normalize, pad_center, tiny -from scipy.signal import get_window - -import logging - -logger = logging.getLogger(__name__) - - -###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py -def window_sumsquare( - window, - n_frames, - hop_length=200, - win_length=800, - n_fft=800, - dtype=np.float32, - norm=None, -): - """ - # from librosa 0.6 - Compute the sum-square envelope of a window function at a given hop length. - This is used to estimate modulation effects induced by windowing - observations in short-time fourier transforms. - Parameters - ---------- - window : string, tuple, number, callable, or list-like - Window specification, as in `get_window` - n_frames : int > 0 - The number of analysis frames - hop_length : int > 0 - The number of samples to advance between frames - win_length : [optional] - The length of the window function. By default, this matches `n_fft`. - n_fft : int > 0 - The length of each analysis frame. - dtype : np.dtype - The data type of the output - Returns - ------- - wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` - The sum-squared envelope of the window function - """ - if win_length is None: - win_length = n_fft - - n = n_fft + hop_length * (n_frames - 1) - x = np.zeros(n, dtype=dtype) - - # Compute the squared window at the desired length - win_sq = get_window(window, win_length, fftbins=True) - win_sq = normalize(win_sq, norm=norm) ** 2 - win_sq = pad_center(win_sq, n_fft) - - # Fill the envelope - for i in range(n_frames): - sample = i * hop_length - x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] - return x - - -class STFT(torch.nn.Module): - def __init__( - self, filter_length=1024, hop_length=512, win_length=None, window="hann" - ): - """ - This module implements an STFT using 1D convolution and 1D transpose convolutions. - This is a bit tricky so there are some cases that probably won't work as working - out the same sizes before and after in all overlap add setups is tough. Right now, - this code should work with hop lengths that are half the filter length (50% overlap - between frames). - - Keyword Arguments: - filter_length {int} -- Length of filters used (default: {1024}) - hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) - win_length {[type]} -- Length of the window function applied to each frame (if not specified, it - equals the filter length). 
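# The spline code above locates each input's bin with a small custom
# searchsorted: count how many bin edges are <= the input and subtract one,
# nudging the top edge by eps so inputs exactly at the boundary fall in the
# last bin.  A quick standalone check against a hand-computed example (the
# clone avoids mutating the caller's tensor, otherwise the logic is the same):
import torch


def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations = bin_locations.clone()
    bin_locations[..., -1] += eps
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1


if __name__ == "__main__":
    edges = torch.tensor([0.0, 0.25, 0.5, 1.0])     # three bins
    x = torch.tensor([0.1, 0.3, 0.75, 1.0])
    print(searchsorted(edges, x))                    # tensor([0, 1, 2, 2])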
(default: {None}) - window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) - (default: {'hann'}) - """ - super(STFT, self).__init__() - self.filter_length = filter_length - self.hop_length = hop_length - self.win_length = win_length if win_length else filter_length - self.window = window - self.forward_transform = None - self.pad_amount = int(self.filter_length / 2) - #scale = self.filter_length / self.hop_length - fourier_basis = np.fft.fft(np.eye(self.filter_length)) - - cutoff = int((self.filter_length / 2 + 1)) - fourier_basis = np.vstack( - [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] - ) - forward_basis = torch.FloatTensor(fourier_basis) - inverse_basis = torch.FloatTensor( - np.linalg.pinv(fourier_basis) - ) - - assert filter_length >= self.win_length - # get window and zero center pad it to filter_length - fft_window = get_window(window, self.win_length, fftbins=True) - fft_window = pad_center(fft_window, size=filter_length) - fft_window = torch.from_numpy(fft_window).float() - - # window the bases - forward_basis *= fft_window - inverse_basis = (inverse_basis.T * fft_window).T - - self.register_buffer("forward_basis", forward_basis.float()) - self.register_buffer("inverse_basis", inverse_basis.float()) - self.register_buffer("fft_window", fft_window.float()) - - def transform(self, input_data, return_phase=False): - """Take input data (audio) to STFT domain. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - """ - # num_batches = input_data.shape[0] - # num_samples = input_data.shape[-1] - - # self.num_samples = num_samples - - # similar to librosa, reflect-pad the input - # input_data = input_data.view(num_batches, 1, num_samples) - # print(1234,input_data.shape) - input_data = F.pad( - input_data, - (self.pad_amount, self.pad_amount), - mode="reflect", - ) - - forward_transform = input_data.unfold(1, self.filter_length, self.hop_length).permute(0, 2, 1) - forward_transform = torch.matmul(self.forward_basis, forward_transform) - - cutoff = int((self.filter_length / 2) + 1) - real_part = forward_transform[:, :cutoff, :] - imag_part = forward_transform[:, cutoff:, :] - - magnitude = torch.sqrt(real_part**2 + imag_part**2) - # phase = torch.atan2(imag_part.data, real_part.data) - - if return_phase: - phase = torch.atan2(imag_part.data, real_part.data) - return magnitude, phase - else: - return magnitude - - def inverse(self, magnitude, phase): - """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced - by the ```transform``` function. - - Arguments: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - - Returns: - inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. 
Of - shape (num_batch, num_samples) - """ - cat = torch.cat( - [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 - ) - - fold = torch.nn.Fold( - output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), - kernel_size=(1, self.filter_length), - stride=(1, self.hop_length)) - inverse_transform = torch.matmul(self.inverse_basis, cat) - inverse_transform = fold(inverse_transform)[:, 0, 0, self.pad_amount : -self.pad_amount] - window_square_sum = self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0) - window_square_sum = fold(window_square_sum)[:, 0, 0, self.pad_amount : -self.pad_amount] - inverse_transform /= window_square_sum - - return inverse_transform - - def forward(self, input_data): - """Take input data (audio) to STFT domain and then back to audio. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of - shape (num_batch, num_samples) - """ - self.magnitude, self.phase = self.transform(input_data, return_phase=True) - reconstruction = self.inverse(self.magnitude, self.phase) - return reconstruction - - -from time import time as ttime - - -class BiGRU(nn.Module): - def __init__(self, input_features, hidden_features, num_layers): - super(BiGRU, self).__init__() - self.gru = nn.GRU( - input_features, - hidden_features, - num_layers=num_layers, - batch_first=True, - bidirectional=True, - ) - - def forward(self, x): - return self.gru(x)[0] - - -class ConvBlockRes(nn.Module): - def __init__(self, in_channels, out_channels, momentum=0.01): - super(ConvBlockRes, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - nn.Conv2d( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - if in_channels != out_channels: - self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) - self.is_shortcut = True - else: - self.is_shortcut = False - - def forward(self, x): - if self.is_shortcut: - return self.conv(x) + self.shortcut(x) - else: - return self.conv(x) + x - - -class Encoder(nn.Module): - def __init__( - self, - in_channels, - in_size, - n_encoders, - kernel_size, - n_blocks, - out_channels=16, - momentum=0.01, - ): - super(Encoder, self).__init__() - self.n_encoders = n_encoders - self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) - self.layers = nn.ModuleList() - self.latent_channels = [] - for i in range(self.n_encoders): - self.layers.append( - ResEncoderBlock( - in_channels, out_channels, kernel_size, n_blocks, momentum=momentum - ) - ) - self.latent_channels.append([out_channels, in_size]) - in_channels = out_channels - out_channels *= 2 - in_size //= 2 - self.out_size = in_size - self.out_channel = out_channels - - def forward(self, x): - concat_tensors = [] - x = self.bn(x) - for i in range(self.n_encoders): - _, x = self.layers[i](x) - concat_tensors.append(_) - return x, concat_tensors - - -class ResEncoderBlock(nn.Module): - def __init__( - self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 - ): - super(ResEncoderBlock, self).__init__() - self.n_blocks = n_blocks - self.conv = nn.ModuleList() - self.conv.append(ConvBlockRes(in_channels, 
out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - self.kernel_size = kernel_size - if self.kernel_size is not None: - self.pool = nn.AvgPool2d(kernel_size=kernel_size) - - def forward(self, x): - for i in range(self.n_blocks): - x = self.conv[i](x) - if self.kernel_size is not None: - return x, self.pool(x) - else: - return x - - -class Intermediate(nn.Module): # - def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): - super(Intermediate, self).__init__() - self.n_inters = n_inters - self.layers = nn.ModuleList() - self.layers.append( - ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) - ) - for i in range(self.n_inters - 1): - self.layers.append( - ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) - ) - - def forward(self, x): - for i in range(self.n_inters): - x = self.layers[i](x) - return x - - -class ResDecoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): - super(ResDecoderBlock, self).__init__() - out_padding = (0, 1) if stride == (1, 2) else (1, 1) - self.n_blocks = n_blocks - self.conv1 = nn.Sequential( - nn.ConvTranspose2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=stride, - padding=(1, 1), - output_padding=out_padding, - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - self.conv2 = nn.ModuleList() - self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x, concat_tensor): - x = self.conv1(x) - x = torch.cat((x, concat_tensor), dim=1) - for i in range(self.n_blocks): - x = self.conv2[i](x) - return x - - -class Decoder(nn.Module): - def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): - super(Decoder, self).__init__() - self.layers = nn.ModuleList() - self.n_decoders = n_decoders - for i in range(self.n_decoders): - out_channels = in_channels // 2 - self.layers.append( - ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) - ) - in_channels = out_channels - - def forward(self, x, concat_tensors): - for i in range(self.n_decoders): - x = self.layers[i](x, concat_tensors[-1 - i]) - return x - - -class DeepUnet(nn.Module): - def __init__( - self, - kernel_size, - n_blocks, - en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(DeepUnet, self).__init__() - self.encoder = Encoder( - in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels - ) - self.intermediate = Intermediate( - self.encoder.out_channel // 2, - self.encoder.out_channel, - inter_layers, - n_blocks, - ) - self.decoder = Decoder( - self.encoder.out_channel, en_de_layers, kernel_size, n_blocks - ) - - def forward(self, x): - x, concat_tensors = self.encoder(x) - x = self.intermediate(x) - x = self.decoder(x, concat_tensors) - return x - - -class E2E(nn.Module): - def __init__( - self, - n_blocks, - n_gru, - kernel_size, - en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(E2E, self).__init__() - self.unet = DeepUnet( - kernel_size, - n_blocks, - en_de_layers, - inter_layers, - in_channels, - en_out_channels, - ) - self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) - if n_gru: - self.fc = nn.Sequential( - BiGRU(3 * 128, 256, n_gru), - nn.Linear(512, 360), - 
nn.Dropout(0.25), - nn.Sigmoid(), - ) - else: - self.fc = nn.Sequential( - nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() - ) - - def forward(self, mel): - # print(mel.shape) - mel = mel.transpose(-1, -2).unsqueeze(1) - x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) - x = self.fc(x) - # print(x.shape) - return x - - -from librosa.filters import mel - - -class MelSpectrogram(torch.nn.Module): - def __init__( - self, - is_half, - n_mel_channels, - sampling_rate, - win_length, - hop_length, - n_fft=None, - mel_fmin=0, - mel_fmax=None, - clamp=1e-5, - ): - super().__init__() - n_fft = win_length if n_fft is None else n_fft - self.hann_window = {} - mel_basis = mel( - sr=sampling_rate, - n_fft=n_fft, - n_mels=n_mel_channels, - fmin=mel_fmin, - fmax=mel_fmax, - htk=True, - ) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer("mel_basis", mel_basis) - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.sampling_rate = sampling_rate - self.n_mel_channels = n_mel_channels - self.clamp = clamp - self.is_half = is_half - - def forward(self, audio, keyshift=0, speed=1, center=True): - factor = 2 ** (keyshift / 12) - n_fft_new = int(np.round(self.n_fft * factor)) - win_length_new = int(np.round(self.win_length * factor)) - hop_length_new = int(np.round(self.hop_length * speed)) - keyshift_key = str(keyshift) + "_" + str(audio.device) - if keyshift_key not in self.hann_window: - self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( - # "cpu"if(audio.device.type=="privateuseone") else audio.device - audio.device - ) - if "privateuseone" in str(audio.device): - if not hasattr(self, "stft"): - self.stft = STFT( - filter_length=n_fft_new, - hop_length=hop_length_new, - win_length=win_length_new, - window="hann", - ).to(audio.device) - magnitude = self.stft.transform(audio) - else: - fft = torch.stft( - audio, - n_fft=n_fft_new, - hop_length=hop_length_new, - win_length=win_length_new, - window=self.hann_window[keyshift_key], - center=center, - return_complex=True, - ) - magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) - # if (audio.device.type == "privateuseone"): - # magnitude=magnitude.to(audio.device) - if keyshift != 0: - size = self.n_fft // 2 + 1 - resize = magnitude.size(1) - if resize < size: - magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) - magnitude = magnitude[:, :size, :] * self.win_length / win_length_new - mel_output = torch.matmul(self.mel_basis, magnitude) - if self.is_half == True: - mel_output = mel_output.half() - log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) - # print(log_mel_spec.device.type) - return log_mel_spec - - -class RMVPE: - def __init__(self, model_path, is_half, device=None): - self.resample_kernel = {} - self.resample_kernel = {} - self.is_half = is_half - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.mel_extractor = MelSpectrogram( - is_half, 128, 16000, 1024, 160, None, 30, 8000 - ).to(device) - if "privateuseone" in str(device): - import onnxruntime as ort - - ort_session = ort.InferenceSession( - "%s/rmvpe.onnx" % os.environ["rmvpe_root"], - providers=["DmlExecutionProvider"], - ) - self.model = ort_session - else: - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu") - model.load_state_dict(ckpt) - model.eval() - if is_half == True: - model = model.half() - self.model = model - self.model = 
self.model.to(device) - cents_mapping = 20 * np.arange(360) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 - - def mel2hidden(self, mel): - with torch.no_grad(): - n_frames = mel.shape[-1] - n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames - if n_pad > 0: - mel = F.pad( - mel, (0, n_pad), mode="constant" - ) - if "privateuseone" in str(self.device): - onnx_input_name = self.model.get_inputs()[0].name - onnx_outputs_names = self.model.get_outputs()[0].name - hidden = self.model.run( - [onnx_outputs_names], - input_feed={onnx_input_name: mel.cpu().numpy()}, - )[0] - else: - hidden = self.model(mel) - return hidden[:, :n_frames] - - def decode(self, hidden, thred=0.03): - cents_pred = self.to_local_average_cents(hidden, thred=thred) - f0 = 10 * (2 ** (cents_pred / 1200)) - f0[f0 == 10] = 0 - # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) - return f0 - - def infer_from_audio(self, audio, thred=0.03): - # torch.cuda.synchronize() - t0 = ttime() - mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True - ) - # print(123123123,mel.device.type) - # torch.cuda.synchronize() - t1 = ttime() - hidden = self.mel2hidden(mel) - # torch.cuda.synchronize() - t2 = ttime() - # print(234234,hidden.device.type) - if "privateuseone" not in str(self.device): - hidden = hidden.squeeze(0).cpu().numpy() - else: - hidden = hidden[0] - if self.is_half == True: - hidden = hidden.astype("float32") - - f0 = self.decode(hidden, thred=thred) - # torch.cuda.synchronize() - t3 = ttime() - # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) - return f0 - - def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100): - t0 = ttime() - audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) - mel = self.mel_extractor(audio, center=True) - t1 = ttime() - hidden = self.mel2hidden(mel) - t2 = ttime() - if "privateuseone" not in str(self.device): - hidden = hidden.squeeze(0).cpu().numpy() - else: - hidden = hidden[0] - if self.is_half == True: - hidden = hidden.astype("float32") - f0 = self.decode(hidden, thred=thred) - f0[(f0 < f0_min) | (f0 > f0_max)] = 0 - t3 = ttime() - return f0 - - def to_local_average_cents(self, salience, thred=0.05): - # t0 = ttime() - center = np.argmax(salience, axis=1) # 帧长#index - salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 - # t1 = ttime() - center += 4 - todo_salience = [] - todo_cents_mapping = [] - starts = center - 4 - ends = center + 5 - for idx in range(salience.shape[0]): - todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) - todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - # t2 = ttime() - todo_salience = np.array(todo_salience) # 帧长,9 - todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 - product_sum = np.sum(todo_salience * todo_cents_mapping, 1) - weight_sum = np.sum(todo_salience, 1) # 帧长 - devided = product_sum / weight_sum # 帧长 - # t3 = ttime() - maxx = np.max(salience, axis=1) # 帧长 - devided[maxx <= thred] = 0 - # t4 = ttime() - # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) - return devided - - -if __name__ == "__main__": - import librosa - import soundfile as sf - - audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") - if len(audio.shape) > 1: - audio = librosa.to_mono(audio.transpose(1, 0)) - audio_bak = audio.copy() - if sampling_rate != 16000: - audio = librosa.resample(audio, orig_sr=sampling_rate, 
target_sr=16000) - model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" - thred = 0.03 # 0.01 - device = "cuda" if torch.cuda.is_available() else "cpu" - rmvpe = RMVPE(model_path, is_half=False, device=device) - t0 = ttime() - f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - t1 = ttime() - logger.info("%s %.2f", f0.shape, t1 - t0) diff --git a/lib/modules.py b/lib/modules.py deleted file mode 100644 index 03911303..00000000 --- a/lib/modules.py +++ /dev/null @@ -1,559 +0,0 @@ -import os, sys -import traceback -import logging -now_dir = os.getcwd() -sys.path.append(now_dir) -logger = logging.getLogger(__name__) -import numpy as np -import soundfile as sf -import torch -from io import BytesIO -from lib.infer_libs.audio import load_audio -from lib.infer_libs.audio import wav2 -from lib.infer_libs.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.pipeline import Pipeline -import time -import glob -from shutil import move - -sup_audioext = { - "wav", - "mp3", - "flac", - "ogg", - "opus", - "m4a", - "mp4", - "aac", - "alac", - "wma", - "aiff", - "webm", - "ac3", -} - -def note_to_hz(note_name): - try: - SEMITONES = {'C': -9, 'C#': -8, 'D': -7, 'D#': -6, 'E': -5, 'F': -4, 'F#': -3, 'G': -2, 'G#': -1, 'A': 0, 'A#': 1, 'B': 2} - pitch_class, octave = note_name[:-1], int(note_name[-1]) - semitone = SEMITONES[pitch_class] - note_number = 12 * (octave - 4) + semitone - frequency = 440.0 * (2.0 ** (1.0/12)) ** note_number - return frequency - except: - return None - -def load_hubert(hubert_model_path="assets/hubert/hubert_base.pt"): - from fairseq import checkpoint_utils - - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - [hubert_model_path], - suffix="", - ) - hubert_model = models[0] - #hubert_model = hubert_model.to(config.device) - hubert_model = hubert_model.float() - - hubert_models = hubert_model.eval() - return hubert_models - -class VC: - def __init__(self, config): - self.n_spk = None - self.tgt_sr = None - self.net_g = None - self.pipeline = None - self.cpt = None - self.version = None - self.if_f0 = None - self.version = None - self.hubert_model = None - - self.config = config - - def get_vc(self, sid, *to_return_protect): - logger.info("Get sid: " + sid) - - to_return_protect0 = { - "visible": self.if_f0 != 0, - "value": to_return_protect[0] - if self.if_f0 != 0 and to_return_protect - else 0.5, - "__type__": "update", - } - to_return_protect1 = { - "visible": self.if_f0 != 0, - "value": to_return_protect[1] - if self.if_f0 != 0 and to_return_protect - else 0.33, - "__type__": "update", - } - - if sid == "" or sid == []: - if self.hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 - logger.info("Clean model cache") - del ( - self.net_g, - self.n_spk, - self.vc, - self.hubert_model, - self.tgt_sr, - ) # ,cpt - self.hubert_model = ( - self.net_g - ) = self.n_spk = self.vc = self.hubert_model = self.tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - ###楼下不这么折腾清理不干净 - self.if_f0 = self.cpt.get("f0", 1) - self.version = self.cpt.get("version", "v1") - if self.version == "v1": - if self.if_f0 == 1: - self.net_g = SynthesizerTrnMs256NSFsid( - *self.cpt["config"], is_half=self.config.is_half - ) - else: 
- self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) - elif self.version == "v2": - if self.if_f0 == 1: - self.net_g = SynthesizerTrnMs768NSFsid( - *self.cpt["config"], is_half=self.config.is_half - ) - else: - self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) - del self.net_g, self.cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return ( - {"visible": False, "__type__": "update"}, - { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - }, - { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - }, - "", - "", - ) - #person = f'{os.getenv("weight_root")}/{sid}' - person = f'{sid}' - #logger.info(f"Loading: {person}") - logger.info(f"Loading...") - self.cpt = torch.load(person, map_location="cpu") - self.tgt_sr = self.cpt["config"][-1] - self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk - self.if_f0 = self.cpt.get("f0", 1) - self.version = self.cpt.get("version", "v1") - - synthesizer_class = { - ("v1", 1): SynthesizerTrnMs256NSFsid, - ("v1", 0): SynthesizerTrnMs256NSFsid_nono, - ("v2", 1): SynthesizerTrnMs768NSFsid, - ("v2", 0): SynthesizerTrnMs768NSFsid_nono, - } - - self.net_g = synthesizer_class.get( - (self.version, self.if_f0), SynthesizerTrnMs256NSFsid - )(*self.cpt["config"], is_half=self.config.is_half) - - del self.net_g.enc_q - - self.net_g.load_state_dict(self.cpt["weight"], strict=False) - self.net_g.eval().to(self.config.device) - if self.config.is_half: - self.net_g = self.net_g.half() - else: - self.net_g = self.net_g.float() - - self.pipeline = Pipeline(self.tgt_sr, self.config) - n_spk = self.cpt["config"][-3] - #index = {"value": get_index_path_from_model(sid), "__type__": "update"} - #logger.info("Select index: " + index["value"]) - - return ( - ( - {"visible": False, "maximum": n_spk, "__type__": "update"}, - to_return_protect0, - to_return_protect1 - ) - if to_return_protect - else {"visible": False, "maximum": n_spk, "__type__": "update"} - ) - - def vc_single_dont_save( - self, - sid, - input_audio_path1, - f0_up_key, - f0_method, - file_index, - file_index2, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - crepe_hop_length, - do_formant, - quefrency, - timbre, - f0_min, - f0_max, - f0_autotune, - hubert_model_path = "assets/hubert/hubert_base.pt" - ): - """ - Performs inference without saving - - Parameters: - - sid (int) - - input_audio_path1 (str) - - f0_up_key (int) - - f0_method (str) - - file_index (str) - - file_index2 (str) - - index_rate (float) - - filter_radius (int) - - resample_sr (int) - - rms_mix_rate (float) - - protect (float) - - crepe_hop_length (int) - - do_formant (bool) - - quefrency (float) - - timbre (float) - - f0_min (str) - - f0_max (str) - - f0_autotune (bool) - - hubert_model_path (str) - - Returns: - Tuple(Tuple(status, index_info, times), Tuple(sr, data)): - - Tuple(status, index_info, times): - - status (str): either "Success." or an error - - index_info (str): index path if used - - times (list): [npy_time, f0_time, infer_time, total_time] - - Tuple(sr, data): Audio data results. 
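Note on the pitch-bound parameters documented above: f0_min and f0_max are strings that may hold either a numeric value in Hz or a note name (e.g. "C3", "A4"); non-digit values are routed through the module's note_to_hz helper. A minimal, self-contained sketch of that conversion (equal temperament around A4 = 440 Hz), mirroring the deleted helper, for reference:

# Minimal sketch of the note-name -> Hz conversion applied to f0_min / f0_max.
SEMITONES = {'C': -9, 'C#': -8, 'D': -7, 'D#': -6, 'E': -5, 'F': -4,
             'F#': -3, 'G': -2, 'G#': -1, 'A': 0, 'A#': 1, 'B': 2}

def note_to_hz(note_name: str) -> float:
    pitch_class, octave = note_name[:-1], int(note_name[-1])
    note_number = 12 * (octave - 4) + SEMITONES[pitch_class]  # semitones from A4
    return 440.0 * (2.0 ** (1.0 / 12.0)) ** note_number

print(note_to_hz("A4"))  # 440.0
print(note_to_hz("C5"))  # ~523.25
print(note_to_hz("C3"))  # ~130.81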
- """ - global total_time - total_time = 0 - start_time = time.time() - - if not input_audio_path1: - return "You need to upload an audio", None - - if not os.path.exists(input_audio_path1): - return "Audio was not properly selected or doesn't exist", None - - f0_up_key = int(f0_up_key) - if not f0_min.isdigit(): - f0_min = note_to_hz(f0_min) - if f0_min: - print(f"Converted Min pitch: freq - {f0_min}") - else: - f0_min = 50 - print("Invalid minimum pitch note. Defaulting to 50hz.") - else: - f0_min = float(f0_min) - if not f0_max.isdigit(): - f0_max = note_to_hz(f0_max) - if f0_max: - print(f"Converted Max pitch: freq - {f0_max}") - else: - f0_max = 1100 - print("Invalid maximum pitch note. Defaulting to 1100hz.") - else: - f0_max = float(f0_max) - - try: - print(f"Attempting to load {input_audio_path1}....") - audio = load_audio(file=input_audio_path1, - sr=16000, - DoFormant=do_formant, - Quefrency=quefrency, - Timbre=timbre) - - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - - if self.hubert_model is None: - self.hubert_model = load_hubert(hubert_model_path, self.config) - - #try: - # self.if_f0 = self.cpt.get("f0", 1) - except NameError: - message = "Model was not properly selected" - print(message) - return message, None - - if file_index and not file_index == "" and isinstance(file_index, str): - file_index = file_index.strip(" ") \ - .strip('"') \ - .strip("\n") \ - .strip('"') \ - .strip(" ") \ - .replace("trained", "added") - elif file_index2: - file_index = file_index2 - else: - file_index = "" - - audio_opt = self.pipeline.pipeline( - self.hubert_model, - self.net_g, - sid, - audio, - input_audio_path1, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - self.if_f0, - filter_radius, - self.tgt_sr, - resample_sr, - rms_mix_rate, - self.version, - protect, - crepe_hop_length, - f0_autotune, - f0_min=f0_min, - f0_max=f0_max - ) - - if self.tgt_sr != resample_sr >= 16000: - tgt_sr = resample_sr - else: - tgt_sr = self.tgt_sr - index_info = ( - "Index: %s." % file_index - if isinstance(file_index, str) and os.path.exists(file_index) - else "Index not used." - ) - end_time = time.time() - total_time = end_time - start_time - times.append(total_time) - return ( - ("Success.", index_info, times), - (tgt_sr, audio_opt), - ) - except: - info = traceback.format_exc() - logger.warn(info) - return ( - (info, None, [None, None, None, None]), - (None, None) - ) - - def vc_single( - self, - sid, - input_audio_path1, - f0_up_key, - f0_method, - file_index, - file_index2, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - format1, - crepe_hop_length, - do_formant, - quefrency, - timbre, - f0_min, - f0_max, - f0_autotune, - hubert_model_path = "assets/hubert/hubert_base.pt" - ): - """ - Performs inference with saving - - Parameters: - - sid (int) - - input_audio_path1 (str) - - f0_up_key (int) - - f0_method (str) - - file_index (str) - - file_index2 (str) - - index_rate (float) - - filter_radius (int) - - resample_sr (int) - - rms_mix_rate (float) - - protect (float) - - format1 (str) - - crepe_hop_length (int) - - do_formant (bool) - - quefrency (float) - - timbre (float) - - f0_min (str) - - f0_max (str) - - f0_autotune (bool) - - hubert_model_path (str) - - Returns: - Tuple(Tuple(status, index_info, times), Tuple(sr, data), output_path): - - Tuple(status, index_info, times): - - status (str): either "Success." 
or an error - - index_info (str): index path if used - - times (list): [npy_time, f0_time, infer_time, total_time] - - Tuple(sr, data): Audio data results. - - output_path (str): Audio results path - """ - global total_time - total_time = 0 - start_time = time.time() - - if not input_audio_path1: - return "You need to upload an audio", None, None - - if not os.path.exists(input_audio_path1): - return "Audio was not properly selected or doesn't exist", None, None - - f0_up_key = int(f0_up_key) - if not f0_min.isdigit(): - f0_min = note_to_hz(f0_min) - if f0_min: - print(f"Converted Min pitch: freq - {f0_min}") - else: - f0_min = 50 - print("Invalid minimum pitch note. Defaulting to 50hz.") - else: - f0_min = float(f0_min) - if not f0_max.isdigit(): - f0_max = note_to_hz(f0_max) - if f0_max: - print(f"Converted Max pitch: freq - {f0_max}") - else: - f0_max = 1100 - print("Invalid maximum pitch note. Defaulting to 1100hz.") - else: - f0_max = float(f0_max) - - try: - print(f"Attempting to load {input_audio_path1}...") - audio = load_audio(file=input_audio_path1, - sr=16000, - DoFormant=do_formant, - Quefrency=quefrency, - Timbre=timbre) - - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - - if self.hubert_model is None: - self.hubert_model = load_hubert(hubert_model_path) - - #try: - # self.if_f0 = self.cpt.get() #"f0" - except NameError: - message = "Model was not properly selected" - print(message) - return message, None - if file_index and not file_index == "" and isinstance(file_index, str): - file_index = file_index.strip(" ") \ - .strip('"') \ - .strip("\n") \ - .strip('"') \ - .strip(" ") \ - .replace("trained", "added") - elif file_index2: - file_index = file_index2 - else: - file_index = "" - - audio_opt = self.pipeline.pipeline( - self.hubert_model, - self.net_g, - sid, - audio, - input_audio_path1, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - self.if_f0, - filter_radius, - self.tgt_sr, - resample_sr, - rms_mix_rate, - self.version, - protect, - crepe_hop_length, - f0_autotune, - f0_min=f0_min, - f0_max=f0_max - ) - - if self.tgt_sr != resample_sr >= 16000: - tgt_sr = resample_sr - else: - tgt_sr = self.tgt_sr - index_info = ( - "Index: %s." % file_index - if isinstance(file_index, str) and os.path.exists(file_index) - else "Index not used." 
- ) - - opt_root = os.path.join(os.getcwd(), "output") - os.makedirs(opt_root, exist_ok=True) - output_count = 1 - - while True: - opt_filename = f"{os.path.splitext(os.path.basename(input_audio_path1))[0]}{os.path.basename(os.path.dirname(file_index))}{f0_method.capitalize()}_{output_count}.{format1}" - current_output_path = os.path.join(opt_root, opt_filename) - if not os.path.exists(current_output_path): - break - output_count += 1 - try: - if format1 in ["wav", "flac"]: - sf.write( - current_output_path, - audio_opt, - self.tgt_sr, - ) - else: - with BytesIO() as wavf: - sf.write( - wavf, - audio_opt, - self.tgt_sr, - format="wav" - ) - wavf.seek(0, 0) - with open(current_output_path, "wb") as outf: - wav2(wavf, outf, format1) - except: - info = traceback.format_exc() - end_time = time.time() - total_time = end_time - start_time - times.append(total_time) - return ( - ("Success.", index_info, times), - (tgt_sr, audio_opt), - current_output_path - ) - except: - info = traceback.format_exc() - logger.warn(info) - return ( - (info, None, [None, None, None, None]), - (None, None), - None - ) diff --git a/lib/pipeline.py b/lib/pipeline.py deleted file mode 100644 index 9a5df716..00000000 --- a/lib/pipeline.py +++ /dev/null @@ -1,773 +0,0 @@ -import os -import sys -import gc -import traceback -import logging - -logger = logging.getLogger(__name__) - -from functools import lru_cache -from time import time as ttime -from torch import Tensor -import faiss -import librosa -import numpy as np -import parselmouth -import pyworld -import torch.nn.functional as F -from scipy import signal -from tqdm import tqdm - -import random -now_dir = os.getcwd() -sys.path.append(now_dir) -import re -from functools import partial -bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) - -input_audio_path2wav = {} -import torchcrepe # Fork Feature. 
Crepe algo for training and preprocess -from torchfcpe import spawn_bundled_infer_model -import torch -from lib.infer_libs.rmvpe import RMVPE -from lib.infer_libs.fcpe import FCPE - -@lru_cache -def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): - audio = input_audio_path2wav[input_audio_path] - f0, t = pyworld.harvest( - audio, - fs=fs, - f0_ceil=f0max, - f0_floor=f0min, - frame_period=frame_period, - ) - f0 = pyworld.stonemask(audio, f0, t, fs) - return f0 - - -def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 - # print(data1.max(),data2.max()) - rms1 = librosa.feature.rms( - y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 - ) # 每半秒一个点 - rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) - rms1 = torch.from_numpy(rms1) - rms1 = F.interpolate( - rms1.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.from_numpy(rms2) - rms2 = F.interpolate( - rms2.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) - data2 *= ( - torch.pow(rms1, torch.tensor(1 - rate)) - * torch.pow(rms2, torch.tensor(rate - 1)) - ).numpy() - return data2 - - -class Pipeline(object): - def __init__(self, tgt_sr, config): - self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( - config.x_pad, - config.x_query, - config.x_center, - config.x_max, - config.is_half, - ) - self.sr = 16000 # hubert输入采样率 - self.window = 160 # 每帧点数 - self.t_pad = self.sr * self.x_pad # 每条前后pad时间 - self.t_pad_tgt = tgt_sr * self.x_pad - self.t_pad2 = self.t_pad * 2 - self.t_query = self.sr * self.x_query # 查询切点前后查询时间 - self.t_center = self.sr * self.x_center # 查询切点位置 - self.t_max = self.sr * self.x_max # 免查询时长阈值 - self.device = config.device - self.model_rmvpe = RMVPE(os.environ["rmvpe_model_path"], is_half=self.is_half, device=self.device) - - self.note_dict = [ - 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, - 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, - 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, - 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, - 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, - 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, - 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, - 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, - 1046.50, 1108.73, 1174.66, 1244.51, 1318.51, 1396.91, - 1479.98, 1567.98, 1661.22, 1760.00, 1864.66, 1975.53, - 2093.00, 2217.46, 2349.32, 2489.02, 2637.02, 2793.83, - 2959.96, 3135.96, 3322.44, 3520.00, 3729.31, 3951.07 - ] - - # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device) - def get_optimal_torch_device(self, index: int = 0) -> torch.device: - if torch.cuda.is_available(): - return torch.device( - f"cuda:{index % torch.cuda.device_count()}" - ) # Very fast - elif torch.backends.mps.is_available(): - return torch.device("mps") - return torch.device("cpu") - - # Fork Feature: Compute f0 with the crepe method - def get_f0_crepe_computation( - self, - x, - f0_min, - f0_max, - p_len, - *args, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. - **kwargs, # Either use crepe-tiny "tiny" or crepe "full". Default is full - ): - x = x.astype( - np.float32 - ) # fixes the F.conv2D exception. We needed to convert double to float. 
- x /= np.quantile(np.abs(x), 0.999) - torch_device = self.get_optimal_torch_device() - audio = torch.from_numpy(x).to(torch_device, copy=True) - audio = torch.unsqueeze(audio, dim=0) - if audio.ndim == 2 and audio.shape[0] > 1: - audio = torch.mean(audio, dim=0, keepdim=True).detach() - audio = audio.detach() - hop_length = kwargs.get('crepe_hop_length', 160) - model = kwargs.get('model', 'full') - print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) - pitch: Tensor = torchcrepe.predict( - audio, - self.sr, - hop_length, - f0_min, - f0_max, - model, - batch_size=hop_length * 2, - device=torch_device, - pad=True, - ) - p_len = p_len or x.shape[0] // hop_length - # Resize the pitch for final f0 - source = np.array(pitch.squeeze(0).cpu().float().numpy()) - source[source < 0.001] = np.nan - target = np.interp( - np.arange(0, len(source) * p_len, len(source)) / p_len, - np.arange(0, len(source)), - source, - ) - f0 = np.nan_to_num(target) - return f0 # Resized f0 - - def get_f0_official_crepe_computation( - self, - x, - f0_min, - f0_max, - *args, - **kwargs - ): - # Pick a batch size that doesn't cause memory errors on your gpu - batch_size = 512 - # Compute pitch using first gpu - audio = torch.tensor(np.copy(x))[None].float() - model = kwargs.get('model', 'full') - f0, pd = torchcrepe.predict( - audio, - self.sr, - self.window, - f0_min, - f0_max, - model, - batch_size=batch_size, - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - return f0 - - # Fork Feature: Compute pYIN f0 method - def get_f0_pyin_computation(self, x, f0_min, f0_max): - y, sr = librosa.load(x, sr=self.sr, mono=True) - f0, _, _ = librosa.pyin(y, fmin=f0_min, fmax=f0_max, sr=self.sr) - f0 = f0[1:] # Get rid of extra first frame - return f0 - - def get_rmvpe(self, x, *args, **kwargs): - if not hasattr(self, "model_rmvpe"): - from lib.infer.infer_libs.rmvpe import RMVPE - - logger.info( - f"Loading rmvpe model, {os.environ['rmvpe_model_path']}" - ) - self.model_rmvpe = RMVPE( - os.environ["rmvpe_model_path"], - is_half=self.is_half, - device=self.device, - ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - logger.info("Cleaning ortruntime memory") - - return f0 - - - def get_pitch_dependant_rmvpe(self, x, f0_min=1, f0_max=40000, *args, **kwargs): - if not hasattr(self, "model_rmvpe"): - from lib.infer.infer_libs.rmvpe import RMVPE - - logger.info( - f"Loading rmvpe model, {os.environ['rmvpe_model_path']}" - ) - self.model_rmvpe = RMVPE( - os.environ["rmvpe_model_path"], - is_half=self.is_half, - device=self.device, - ) - f0 = self.model_rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=f0_min, f0_max=f0_max) - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - logger.info("Cleaning ortruntime memory") - - return f0 - - def get_fcpe(self, x, f0_min, f0_max, p_len, *args, **kwargs): - self.model_fcpe = FCPE(os.environ["fcpe_model_path"], f0_min=f0_min, f0_max=f0_max, dtype=torch.float32, device=self.device, sampling_rate=self.sr, threshold=0.03) - f0 = self.model_fcpe.compute_f0(x, p_len=p_len) - del self.model_fcpe - gc.collect() - return f0 - - def get_torchfcpe(self, x, sr, f0_min, f0_max, p_len, *args, **kwargs): - self.model_torchfcpe = 
spawn_bundled_infer_model(device=self.device) - f0 = self.model_torchfcpe.infer( - torch.from_numpy(x).float().unsqueeze(0).unsqueeze(-1).to(self.device), - sr=sr, - decoder_mode="local_argmax", - threshold=0.006, - f0_min=f0_min, - f0_max=f0_max, - output_interp_target_length=p_len - ) - return f0.squeeze().cpu().numpy() - - def autotune_f0(self, f0): - autotuned_f0 = [] - for freq in f0: - closest_notes = [x for x in self.note_dict if abs(x - freq) == min(abs(n - freq) for n in self.note_dict)] - autotuned_f0.append(random.choice(closest_notes)) - return np.array(autotuned_f0, np.float64) - - - # Fork Feature: Acquire median hybrid f0 estimation calculation - def get_f0_hybrid_computation( - self, - methods_str, - input_audio_path, - x, - f0_min, - f0_max, - p_len, - filter_radius, - crepe_hop_length, - time_step, - ): - # Get various f0 methods from input to use in the computation stack - methods_str = re.search('hybrid\[(.+)\]', methods_str) - if methods_str: # Ensure a match was found - methods = [method.strip() for method in methods_str.group(1).split('+')] - f0_computation_stack = [] - - print("Calculating f0 pitch estimations for methods: %s" % str(methods)) - x = x.astype(np.float32) - x /= np.quantile(np.abs(x), 0.999) - # Get f0 calculations for all methods specified - for method in methods: - f0 = None - if method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif method == "crepe": - f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, model="full") - f0 = f0[1:] - elif method == "crepe-tiny": - f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, model="tiny") - f0 = f0[1:] # Get rid of extra first frame - elif method == "mangio-crepe": - f0 = self.get_f0_crepe_computation( - x, f0_min, f0_max, p_len, crepe_hop_length=crepe_hop_length - ) - elif method == "mangio-crepe-tiny": - f0 = self.get_f0_crepe_computation( - x, f0_min, f0_max, p_len, crepe_hop_length=crepe_hop_length, model="tiny" - ) - elif method == "harvest": - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) - elif method == "dio": - f0, t = pyworld.dio( - x.astype(np.double), - fs=self.sr, - f0_ceil=f0_max, - f0_floor=f0_min, - frame_period=10, - ) - f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) - f0 = signal.medfilt(f0, 3) - f0 = f0[1:] - elif method == "rmvpe": - f0 = self.get_rmvpe(x) - f0 = f0[1:] - elif method == "fcpe_legacy": - f0 = self.get_fcpe(x, f0_min=f0_min, f0_max=f0_max, p_len=p_len) - elif method == "fcpe": - f0 = self.get_torchfcpe(x, self.sr, f0_min, f0_max, p_len) - elif method == "pyin": - f0 = self.get_f0_pyin_computation(input_audio_path, f0_min, f0_max) - # Push method to the stack - f0_computation_stack.append(f0) - - for fc in f0_computation_stack: - print(len(fc)) - - print("Calculating hybrid median f0 from the stack of: %s" % str(methods)) - f0_median_hybrid = None - if len(f0_computation_stack) == 1: - f0_median_hybrid = f0_computation_stack[0] - else: - f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0) - return f0_median_hybrid - - def get_f0( - self, - input_audio_path, - 
x, - p_len, - f0_up_key, - f0_method, - filter_radius, - crepe_hop_length, - f0_autotune, - inp_f0=None, - f0_min=50, - f0_max=1100, - ): - global input_audio_path2wav - time_step = self.window / self.sr * 1000 - f0_min = f0_min - f0_max = f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - - if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif f0_method == "harvest": - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) - elif f0_method == "dio": # Potentially Buggy? - f0, t = pyworld.dio( - x.astype(np.double), - fs=self.sr, - f0_ceil=f0_max, - f0_floor=f0_min, - frame_period=10, - ) - f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) - f0 = signal.medfilt(f0, 3) - elif f0_method == "crepe": - model = "full" - # Pick a batch size that doesn't cause memory errors on your gpu - batch_size = 512 - # Compute pitch using first gpu - audio = torch.tensor(np.copy(x))[None].float() - f0, pd = torchcrepe.predict( - audio, - self.sr, - self.window, - f0_min, - f0_max, - model, - batch_size=batch_size, - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - elif f0_method == "crepe-tiny": - f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, model="tiny") - elif f0_method == "mangio-crepe": - f0 = self.get_f0_crepe_computation( - x, f0_min, f0_max, p_len, crepe_hop_length=crepe_hop_length - ) - elif f0_method == "mangio-crepe-tiny": - f0 = self.get_f0_crepe_computation( - x, f0_min, f0_max, p_len, crepe_hop_length=crepe_hop_length, model="tiny" - ) - elif f0_method == "rmvpe": - if not hasattr(self, "model_rmvpe"): - from lib.infer.infer_libs.rmvpe import RMVPE - - logger.info( - f"Loading rmvpe model, {os.environ['rmvpe_model_path']}" - ) - self.model_rmvpe = RMVPE( - os.environ["rmvpe_model_path"], - is_half=self.is_half, - device=self.device, - ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - logger.info("Cleaning ortruntime memory") - elif f0_method == "rmvpe+": - params = {'x': x, 'p_len': p_len, 'f0_up_key': f0_up_key, 'f0_min': f0_min, - 'f0_max': f0_max, 'time_step': time_step, 'filter_radius': filter_radius, - 'crepe_hop_length': crepe_hop_length, 'model': "full" - } - f0 = self.get_pitch_dependant_rmvpe(**params) - elif f0_method == "pyin": - f0 = self.get_f0_pyin_computation(input_audio_path, f0_min, f0_max) - elif f0_method == "fcpe_legacy": - f0 = self.get_fcpe(x, f0_min=f0_min, f0_max=f0_max, p_len=p_len) - elif f0_method == "fcpe": - f0 = self.get_torchfcpe(x, self.sr, f0_min, f0_max, p_len) - elif "hybrid" in f0_method: - # Perform hybrid median pitch estimation - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = self.get_f0_hybrid_computation( - f0_method, - input_audio_path, - x, - f0_min, - f0_max, - p_len, - filter_radius, - crepe_hop_length, - time_step, - ) - 
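For the "hybrid[...]" branch handled above: the method string is parsed into individual estimators, each estimator is run, and the per-frame nanmedian of the resulting stack is returned. A small sketch of that combination step, with dummy f0 arrays standing in for real estimator output:

import re
import numpy as np

methods_str = "hybrid[rmvpe+harvest]"
match = re.search(r"hybrid\[(.+)\]", methods_str)
methods = [m.strip() for m in match.group(1).split("+")]   # ['rmvpe', 'harvest']

f0_stack = [
    np.array([220.0, 222.0, np.nan, 0.0]),  # dummy "rmvpe" output
    np.array([219.0, 225.0, 230.0, 0.0]),   # dummy "harvest" output
]
f0_hybrid = np.nanmedian(f0_stack, axis=0)  # per-frame median, ignoring NaNs
print(methods, f0_hybrid)                   # ['rmvpe', 'harvest'] [219.5 223.5 230.  0. ]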
#print("Autotune:", f0_autotune) - if f0_autotune == True: - print("Autotune:", f0_autotune) - f0 = self.autotune_f0(f0) - - f0 *= pow(2, f0_up_key / 12) - # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - tf0 = self.sr // self.window # 每秒f0点数 - if inp_f0 is not None: - delta_t = np.round( - (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 - ).astype("int16") - replace_f0 = np.interp( - list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] - ) - shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] - f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ - :shape - ] - # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak # 1-0 - - def vc( - self, - model, - net_g, - sid, - audio0, - pitch, - pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - ): # ,file_index,file_big_npy - feats = torch.from_numpy(audio0) - if self.is_half: - feats = feats.half() - else: - feats = feats.float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - feats = feats.view(1, -1) - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - "output_layer": 9 if version == "v1" else 12, - } - t0 = ttime() - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = model.final_proj(logits[0]) if version == "v1" else logits[0] - if protect < 0.5 and pitch is not None and pitchf is not None: - feats0 = feats.clone() - if ( - not isinstance(index, type(None)) - and not isinstance(big_npy, type(None)) - and index_rate != 0 - ): - npy = feats[0].cpu().numpy() - if self.is_half: - npy = npy.astype("float32") - - # _, I = index.search(npy, 1) - # npy = big_npy[I.squeeze()] - - score, ix = index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - - if self.is_half: - npy = npy.astype("float16") - feats = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate - + (1 - index_rate) * feats - ) - - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and pitch is not None and pitchf is not None: - feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( - 0, 2, 1 - ) - t1 = ttime() - p_len = audio0.shape[0] // self.window - if feats.shape[1] < p_len: - p_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] - - if protect < 0.5 and pitch is not None and pitchf is not None: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - p_len = torch.tensor([p_len], device=self.device).long() - with torch.no_grad(): - hasp = pitch is not None and pitchf is not None - arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) - audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() - del hasp, arg - del feats, p_len, padding_mask - if 
torch.cuda.is_available(): - torch.cuda.empty_cache() - t2 = ttime() - times[0] += t1 - t0 - times[2] += t2 - t1 - return audio1 - def process_t(self, t, s, window, audio_pad, pitch, pitchf, times, index, big_npy, index_rate, version, protect, t_pad_tgt, if_f0, sid, model, net_g): - t = t // window * window - if if_f0 == 1: - return self.vc( - model, - net_g, - sid, - audio_pad[s : t + t_pad_tgt + window], - pitch[:, s // window : (t + t_pad_tgt) // window], - pitchf[:, s // window : (t + t_pad_tgt) // window], - times, - index, - big_npy, - index_rate, - version, - protect, - )[t_pad_tgt : -t_pad_tgt] - else: - return self.vc( - model, - net_g, - sid, - audio_pad[s : t + t_pad_tgt + window], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[t_pad_tgt : -t_pad_tgt] - - - def pipeline( - self, - model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - crepe_hop_length, - f0_autotune, - f0_min=50, - f0_max=1100 - ): - if ( - file_index != "" - and isinstance(file_index, str) - # and file_big_npy != "" - # and os.path.exists(file_big_npy) == True - and os.path.exists(file_index) - and index_rate != 0 - ): - try: - index = faiss.read_index(file_index) - # big_npy = np.load(file_big_npy) - big_npy = index.reconstruct_n(0, index.ntotal) - except: - traceback.print_exc() - index = big_npy = None - else: - index = big_npy = None - audio = signal.filtfilt(bh, ah, audio) - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") - opt_ts = [] - if audio_pad.shape[0] > self.t_max: - audio_sum = np.zeros_like(audio) - for i in range(self.window): - audio_sum += audio_pad[i : i - self.window] - for t in range(self.t_center, audio.shape[0], self.t_center): - opt_ts.append( - t - - self.t_query - + np.where( - np.abs(audio_sum[t - self.t_query : t + self.t_query]) - == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() - )[0][0] - ) - s = 0 - audio_opt = [] - t = None - t1 = ttime() - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") - p_len = audio_pad.shape[0] // self.window - inp_f0 = None - - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - pitch, pitchf = None, None - if if_f0: - pitch, pitchf = self.get_f0( - input_audio_path, - audio_pad, - p_len, - f0_up_key, - f0_method, - filter_radius, - crepe_hop_length, - f0_autotune, - inp_f0, - f0_min, - f0_max - ) - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] - if "mps" not in str(self.device) or "xpu" not in str(self.device): - pitchf = pitchf.astype(np.float32) - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() - t2 = ttime() - times[1] += t2 - t1 - - with tqdm(total=len(opt_ts), desc="Processing", unit="window") as pbar: - for i, t in enumerate(opt_ts): - t = t // self.window * self.window - start = s - end = t + self.t_pad2 + self.window - audio_slice = audio_pad[start:end] - pitch_slice = pitch[:, start // self.window:end // self.window] if if_f0 else None - pitchf_slice = pitchf[:, start // self.window:end // self.window] if if_f0 else None - audio_opt.append(self.vc(model, net_g, sid, audio_slice, pitch_slice, pitchf_slice, times, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt]) - s = t - pbar.update(1) - pbar.refresh() - - audio_slice = audio_pad[t:] - pitch_slice = pitch[:, 
t // self.window:] if if_f0 and t is not None else pitch - pitchf_slice = pitchf[:, t // self.window:] if if_f0 and t is not None else pitchf - audio_opt.append(self.vc(model, net_g, sid, audio_slice, pitch_slice, pitchf_slice, times, index, big_npy, index_rate, version, protect)[self.t_pad_tgt : -self.t_pad_tgt]) - - audio_opt = np.concatenate(audio_opt) - if rms_mix_rate != 1: - audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) - if tgt_sr != resample_sr >= 16000: - audio_opt = librosa.resample( - audio_opt, orig_sr=tgt_sr, target_sr=resample_sr - ) - audio_max = np.abs(audio_opt).max() / 0.99 - max_int16 = 32768 - if audio_max > 1: - max_int16 /= audio_max - audio_opt = (audio_opt * max_int16).astype(np.int16) - del pitch, pitchf, sid - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - print("Returning completed audio...") - return audio_opt \ No newline at end of file diff --git a/lib/split_audio.py b/lib/split_audio.py deleted file mode 100644 index 90e52c9a..00000000 --- a/lib/split_audio.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -from pydub import AudioSegment -from pydub.silence import detect_silence, detect_nonsilent - -SEPERATE_DIR = os.path.join(os.getcwd(), "seperate") -TEMP_DIR = os.path.join(SEPERATE_DIR, "temp") -cache = {} - -os.makedirs(SEPERATE_DIR, exist_ok=True) -os.makedirs(TEMP_DIR, exist_ok=True) - -def cache_result(func): - def wrapper(*args, **kwargs): - key = (args, frozenset(kwargs.items())) - if key in cache: - return cache[key] - else: - result = func(*args, **kwargs) - cache[key] = result - return result - return wrapper - -def get_non_silent(audio_name, audio, min_silence, silence_thresh, seek_step, keep_silence): - """ - Function to get non-silent parts of the audio. - """ - nonsilent_ranges = detect_nonsilent(audio, min_silence_len=min_silence, silence_thresh=silence_thresh, seek_step=seek_step) - nonsilent_files = [] - for index, range in enumerate(nonsilent_ranges): - nonsilent_name = os.path.join(SEPERATE_DIR, f"{audio_name}_min{min_silence}_t{silence_thresh}_ss{seek_step}_ks{keep_silence}", f"nonsilent{index}-{audio_name}.wav") - start, end = range[0] - keep_silence, range[1] + keep_silence - audio[start:end].export(nonsilent_name, format="wav") - nonsilent_files.append(nonsilent_name) - return nonsilent_files - -def get_silence(audio_name, audio, min_silence, silence_thresh, seek_step, keep_silence): - """ - Function to get silent parts of the audio. - """ - silence_ranges = detect_silence(audio, min_silence_len=min_silence, silence_thresh=silence_thresh, seek_step=seek_step) - silence_files = [] - for index, range in enumerate(silence_ranges): - silence_name = os.path.join(SEPERATE_DIR, f"{audio_name}_min{min_silence}_t{silence_thresh}_ss{seek_step}_ks{keep_silence}", f"silence{index}-{audio_name}.wav") - start, end = range[0] + keep_silence, range[1] - keep_silence - audio[start:end].export(silence_name, format="wav") - silence_files.append(silence_name) - return silence_files - -@cache_result -def split_silence_nonsilent(input_path, min_silence=500, silence_thresh=-40, seek_step=1, keep_silence=100): - """ - Function to split the audio into silent and non-silent parts. 
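The three helpers in this removed module were intended to be chained as split -> per-chunk inference -> length adjustment -> recombine. A hedged usage sketch against the pre-diff tree (run_inference is a hypothetical stand-in for the actual RVC conversion call, and the file paths are illustrative only):

from lib.split_audio import (
    split_silence_nonsilent,
    adjust_audio_lengths,
    combine_silence_nonsilent,
)

def run_inference(chunk_path: str) -> str:
    # Placeholder: convert one non-silent chunk and return the converted wav path.
    return chunk_path

silences, nonsilents = split_silence_nonsilent("audio_input/song.wav", keep_silence=100)
converted = [run_inference(path) for path in nonsilents]
converted = adjust_audio_lengths(nonsilents, converted)
combine_silence_nonsilent(silences, converted, keep_silence=100, output="output/song_out.wav")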
- """ - audio_name = os.path.splitext(os.path.basename(input_path))[0] - os.makedirs(os.path.join(SEPERATE_DIR, f"{audio_name}_min{min_silence}_t{silence_thresh}_ss{seek_step}_ks{keep_silence}"), exist_ok=True) - audio = AudioSegment.silent(duration=1000) + AudioSegment.from_file(input_path) + AudioSegment.silent(duration=1000) - silence_files = get_silence(audio_name, audio, min_silence, silence_thresh, seek_step, keep_silence) - nonsilent_files = get_non_silent(audio_name, audio, min_silence, silence_thresh, seek_step, keep_silence) - return silence_files, nonsilent_files - -def adjust_audio_lengths(original_audios, inferred_audios): - """ - Function to adjust the lengths of the inferred audio files list to match the original audio files length. - """ - adjusted_audios = [] - for original_audio, inferred_audio in zip(original_audios, inferred_audios): - audio_1 = AudioSegment.from_file(original_audio) - audio_2 = AudioSegment.from_file(inferred_audio) - - if len(audio_1) > len(audio_2): - audio_2 += AudioSegment.silent(duration=len(audio_1) - len(audio_2)) - else: - audio_2 = audio_2[:len(audio_1)] - - adjusted_file = os.path.join(TEMP_DIR, f"adjusted-{os.path.basename(inferred_audio)}") - audio_2.export(adjusted_file, format="wav") - adjusted_audios.append(adjusted_file) - - return adjusted_audios - -def combine_silence_nonsilent(silence_files, nonsilent_files, keep_silence, output): - """ - Function to combine the silent and non-silent parts of the audio. - """ - combined = AudioSegment.empty() - for silence, nonsilent in zip(silence_files, nonsilent_files): - combined += AudioSegment.from_wav(silence) + AudioSegment.from_wav(nonsilent) - combined += AudioSegment.from_wav(silence_files[-1]) - combined = AudioSegment.silent(duration=keep_silence) + combined[1000:-1000] + AudioSegment.silent(duration=keep_silence) - combined.export(output, format="wav") - return output \ No newline at end of file diff --git a/logs/mute/f0/mute.wav.npy b/logs/mute/f0/mute.wav.npy new file mode 100644 index 00000000..a7ecfbf9 Binary files /dev/null and b/logs/mute/f0/mute.wav.npy differ diff --git a/logs/mute/f0_voiced/mute.wav.npy b/logs/mute/f0_voiced/mute.wav.npy new file mode 100644 index 00000000..cf5c21bd Binary files /dev/null and b/logs/mute/f0_voiced/mute.wav.npy differ diff --git a/logs/mute/sliced_audios/mute32000.wav b/logs/mute/sliced_audios/mute32000.wav new file mode 100644 index 00000000..b4b50292 Binary files /dev/null and b/logs/mute/sliced_audios/mute32000.wav differ diff --git a/logs/mute/sliced_audios/mute40000.wav b/logs/mute/sliced_audios/mute40000.wav new file mode 100644 index 00000000..fcf1281d Binary files /dev/null and b/logs/mute/sliced_audios/mute40000.wav differ diff --git a/logs/mute/sliced_audios/mute48000.wav b/logs/mute/sliced_audios/mute48000.wav new file mode 100644 index 00000000..72822a01 Binary files /dev/null and b/logs/mute/sliced_audios/mute48000.wav differ diff --git a/logs/mute/sliced_audios_16k/mute.wav b/logs/mute/sliced_audios_16k/mute.wav new file mode 100644 index 00000000..27a7d638 Binary files /dev/null and b/logs/mute/sliced_audios_16k/mute.wav differ diff --git a/logs/mute/v1_extracted/mute.npy b/logs/mute/v1_extracted/mute.npy new file mode 100644 index 00000000..ffe35e78 Binary files /dev/null and b/logs/mute/v1_extracted/mute.npy differ diff --git a/logs/mute/v2_extracted/mute.npy b/logs/mute/v2_extracted/mute.npy new file mode 100644 index 00000000..b14cfb83 Binary files /dev/null and b/logs/mute/v2_extracted/mute.npy differ diff --git a/models.py 
b/models.py index 7401db5f..ebe76c7c 100644 --- a/models.py +++ b/models.py @@ -1,37 +1,5 @@ -import os -import requests -from pathlib import Path +from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline -# Function to download file -def download_file(url, dest_path): - try: - response = requests.get(url, stream=True) - response.raise_for_status() # Check if the request was successful - with open(dest_path, 'wb') as file: - for chunk in response.iter_content(chunk_size=8192): - file.write(chunk) - - print(f"Successfully downloaded {dest_path}") - - except requests.exceptions.RequestException as e: - print(f"Error downloading {url}: {e}") - -# Directory structure -base_dir = "assets" -directories = ["fcpe", "hubert", "rmvpe"] -urls = [ - "https://huggingface.co/datasets/ylzz1997/rmvpe_pretrain_model/resolve/main/fcpe.pt", - "https://huggingface.co/Kit-Lemonfoot/RVC_DidntAsk/resolve/main/hubert_base.pt", - "https://huggingface.co/Kit-Lemonfoot/RVC_DidntAsk/resolve/main/rmvpe.pt" -] - -# Ensure directories exist -for directory in directories: - os.makedirs(Path(base_dir) / directory, exist_ok=True) - -# Download the files -for url, directory in zip(urls, directories): - file_name = url.split("/")[-1] - dest_path = Path(base_dir) / directory / file_name - download_file(url, dest_path) +print("downloading models...") +prerequisites_download_pipeline(models=True, exe=True) diff --git a/requirements.txt b/requirements.txt index ebfaa174..07150ff9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,33 @@ -av +pip>=23.3; sys_platform == 'darwin' +wheel; sys_platform == 'darwin' +PyYAML; sys_platform == 'darwin' +tqdm +wget ffmpeg-python>=0.2.0 -faiss_cpu==1.7.3 -praat-parselmouth==0.4.2 -pyworld==0.3.4 -resampy==0.4.2 -fairseq==0.12.2 -pydub==0.25.1 -einops -local_attention -torchcrepe==0.0.20 -torchfcpe +faiss-cpu==1.7.3 +soundfile==0.12.1 +noisereduce +pedalboard +stftpitchshift yt-dlp audio-separator[gpu] -edge-tts -gradio==4.40.0 +omegaconf>=2.0.6; sys_platform == 'darwin' +numba; sys_platform == 'linux' +numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32' +torchaudio==2.3.1 +torchvision==0.18.1 +torchcrepe==0.0.23 +torchfcpe +libf0 +transformers==4.44.2 +matplotlib==3.7.2 +tensorboard +gradio==4.43.0 +certifi>=2023.07.22; sys_platform == 'darwin' +antlr4-python3-runtime==4.8; sys_platform == 'darwin' +tensorboardX +edge-tts==6.1.9 +pypresence +beautifulsoup4 +flask + diff --git a/rvc/configs/config.py b/rvc/configs/config.py new file mode 100644 index 00000000..e6490936 --- /dev/null +++ b/rvc/configs/config.py @@ -0,0 +1,179 @@ +import torch +import json +import os + + +version_config_paths = [ + os.path.join("v1", "32000.json"), + os.path.join("v1", "40000.json"), + os.path.join("v1", "48000.json"), + os.path.join("v2", "48000.json"), + os.path.join("v2", "40000.json"), + os.path.join("v2", "32000.json"), +] + + +def singleton(cls): + instances = {} + + def get_instance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + + return get_instance + + +@singleton +class Config: + def __init__(self): + self.device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.is_half = self.device != "cpu" + self.gpu_name = ( + torch.cuda.get_device_name(int(self.device.split(":")[-1])) + if self.device.startswith("cuda") + else None + ) + self.json_config = self.load_config_json() + self.gpu_mem = None + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + def 
load_config_json(self) -> dict: + configs = {} + for config_file in version_config_paths: + config_path = os.path.join("rvc", "configs", config_file) + with open(config_path, "r") as f: + configs[config_file] = json.load(f) + return configs + + def has_mps(self) -> bool: + # Check if Metal Performance Shaders are available - for macOS 12.3+. + return torch.backends.mps.is_available() + + def has_xpu(self) -> bool: + # Check if XPU is available. + return hasattr(torch, "xpu") and torch.xpu.is_available() + + def set_precision(self, precision): + if precision not in ["fp32", "fp16"]: + raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.") + + fp16_run_value = precision == "fp16" + preprocess_target_version = "3.7" if precision == "fp16" else "3.0" + preprocess_path = os.path.join( + os.path.dirname(__file__), + os.pardir, + "rvc", + "train", + "preprocess", + "preprocess.py", + ) + + for config_path in version_config_paths: + full_config_path = os.path.join("rvc", "configs", config_path) + try: + with open(full_config_path, "r") as f: + config = json.load(f) + config["train"]["fp16_run"] = fp16_run_value + with open(full_config_path, "w") as f: + json.dump(config, f, indent=4) + except FileNotFoundError: + print(f"File not found: {full_config_path}") + + if os.path.exists(preprocess_path): + with open(preprocess_path, "r") as f: + preprocess_content = f.read() + preprocess_content = preprocess_content.replace( + "3.0" if precision == "fp16" else "3.7", preprocess_target_version + ) + with open(preprocess_path, "w") as f: + f.write(preprocess_content) + + return f"Overwritten preprocess and config.json to use {precision}." + + def get_precision(self): + if not version_config_paths: + raise FileNotFoundError("No configuration paths provided.") + + full_config_path = os.path.join("rvc", "configs", version_config_paths[0]) + try: + with open(full_config_path, "r") as f: + config = json.load(f) + fp16_run_value = config["train"].get("fp16_run", False) + precision = "fp16" if fp16_run_value else "fp32" + return precision + except FileNotFoundError: + print(f"File not found: {full_config_path}") + return None + + def device_config(self) -> tuple: + if self.device.startswith("cuda"): + self.set_cuda_config() + elif self.has_mps(): + self.device = "mps" + self.is_half = False + self.set_precision("fp32") + else: + self.device = "cpu" + self.is_half = False + self.set_precision("fp32") + + # Configuration for 6GB GPU memory + x_pad, x_query, x_center, x_max = ( + (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41) + ) + if self.gpu_mem is not None and self.gpu_mem <= 4: + # Configuration for 5GB GPU memory + x_pad, x_query, x_center, x_max = (1, 5, 30, 32) + + return x_pad, x_query, x_center, x_max + + def set_cuda_config(self): + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"] + if ( + any(gpu in self.gpu_name for gpu in low_end_gpus) + and "V100" not in self.gpu_name.upper() + ): + self.is_half = False + self.set_precision("fp32") + + self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // ( + 1024**3 + ) + + +def max_vram_gpu(gpu): + if torch.cuda.is_available(): + gpu_properties = torch.cuda.get_device_properties(gpu) + total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024) + return total_memory_gb + else: + return "8" + + +def get_gpu_info(): + ngpu = torch.cuda.device_count() + gpu_infos = [] + if torch.cuda.is_available() or ngpu != 
0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + mem = int( + torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + + 0.4 + ) + gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)") + if len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) + else: + gpu_info = "Unfortunately, there is no compatible GPU available to support your training." + return gpu_info + + +def get_number_of_gpus(): + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + return "-".join(map(str, range(num_gpus))) + else: + return "-" diff --git a/rvc/configs/v1/32000.json b/rvc/configs/v1/32000.json new file mode 100644 index 00000000..2f28f4f6 --- /dev/null +++ b/rvc/configs/v1/32000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,4,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/v1/40000.json b/rvc/configs/v1/40000.json new file mode 100644 index 00000000..3961ddb6 --- /dev/null +++ b/rvc/configs/v1/40000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/v1/48000.json b/rvc/configs/v1/48000.json new file mode 100644 index 00000000..41ea3b62 --- /dev/null +++ b/rvc/configs/v1/48000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 11520, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, 
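`rvc/configs/config.py` above wraps device detection in a `@singleton`, so every caller shares one `Config` instance. A rough usage sketch, assuming it is run from the repository root so the relative `rvc/configs/...` paths resolve:

```python
from rvc.configs.config import Config

cfg = Config()                 # first call builds the instance (and may rewrite fp16_run in the JSONs)
assert Config() is cfg         # the @singleton decorator hands back the cached object

print(cfg.device, cfg.is_half)                          # e.g. "cuda:0", True
print(cfg.x_pad, cfg.x_query, cfg.x_center, cfg.x_max)  # window sizes consumed by the pipeline
print(cfg.get_precision())                              # "fp16" or "fp32", read from the first config JSON
```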
+ "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/v2/32000.json b/rvc/configs/v2/32000.json new file mode 100644 index 00000000..eabab7b5 --- /dev/null +++ b/rvc/configs/v2/32000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [20,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/v2/40000.json b/rvc/configs/v2/40000.json new file mode 100644 index 00000000..e1ba44a9 --- /dev/null +++ b/rvc/configs/v2/40000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/configs/v2/48000.json b/rvc/configs/v2/48000.json new file mode 100644 index 00000000..1a4da9f5 --- /dev/null +++ b/rvc/configs/v2/48000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 17280, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + 
"resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [12,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [24,20,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py new file mode 100644 index 00000000..ae78283d --- /dev/null +++ b/rvc/infer/infer.py @@ -0,0 +1,495 @@ +import os +import sys +import time +import torch +import librosa +import logging +import traceback +import numpy as np +import soundfile as sf +import noisereduce as nr +from pedalboard import ( + Pedalboard, + Chorus, + Distortion, + Reverb, + PitchShift, + Limiter, + Gain, + Bitcrush, + Clipping, + Compressor, + Delay, +) + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from rvc.infer.pipeline import Pipeline as VC +from rvc.lib.utils import load_audio_infer, load_embedding +from rvc.lib.tools.split_audio import process_audio, merge_audio +from rvc.lib.algorithm.synthesizers import Synthesizer +from rvc.configs.config import Config + +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) +logging.getLogger("faiss").setLevel(logging.WARNING) +logging.getLogger("faiss.loader").setLevel(logging.WARNING) + + +class VoiceConverter: + """ + A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method. + """ + + def __init__(self): + """ + Initializes the VoiceConverter with default configuration, and sets up models and parameters. + """ + self.config = Config() # Load RVC configuration + self.hubert_model = ( + None # Initialize the Hubert model (for embedding extraction) + ) + self.last_embedder_model = None # Last used embedder model + self.tgt_sr = None # Target sampling rate for the output audio + self.net_g = None # Generator network for voice conversion + self.vc = None # Voice conversion pipeline instance + self.cpt = None # Checkpoint for loading model weights + self.version = None # Model version + self.n_spk = None # Number of speakers in the model + self.use_f0 = None # Whether the model uses F0 + self.loaded_model = None + + def load_hubert(self, embedder_model: str, embedder_model_custom: str = None): + """ + Loads the HuBERT model for speaker embedding extraction. + + Args: + embedder_model (str): Path to the pre-trained HuBERT model. + embedder_model_custom (str): Path to the custom HuBERT model. + """ + self.hubert_model = load_embedding(embedder_model, embedder_model_custom) + self.hubert_model.to(self.config.device) + self.hubert_model = ( + self.hubert_model.half() + if self.config.is_half + else self.hubert_model.float() + ) + self.hubert_model.eval() + + @staticmethod + def remove_audio_noise(data, sr, reduction_strength=0.7): + """ + Removes noise from an audio file using the NoiseReduce library. + + Args: + data (numpy.ndarray): The audio data as a NumPy array. + sr (int): The sample rate of the audio data. + reduction_strength (float): Strength of the noise reduction. Default is 0.7. + """ + try: + reduced_noise = nr.reduce_noise( + y=data, sr=sr, prop_decrease=reduction_strength + ) + return reduced_noise + except Exception as error: + print(f"An error occurred removing audio noise: {error}") + return None + + @staticmethod + def convert_audio_format(input_path, output_path, output_format): + """ + Converts an audio file to a specified output format. + + Args: + input_path (str): Path to the input audio file. 
+ output_path (str): Path to the output audio file. + output_format (str): Desired audio format (e.g., "WAV", "MP3"). + """ + try: + if output_format != "WAV": + print(f"Saving audio as {output_format}...") + audio, sample_rate = librosa.load(input_path, sr=None) + common_sample_rates = [ + 8000, + 11025, + 12000, + 16000, + 22050, + 24000, + 32000, + 44100, + 48000, + ] + target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate)) + audio = librosa.resample( + audio, orig_sr=sample_rate, target_sr=target_sr + ) + sf.write(output_path, audio, target_sr, format=output_format.lower()) + return output_path + except Exception as error: + print(f"An error occurred converting the audio format: {error}") + + @staticmethod + def post_process_audio( + audio_input, + sample_rate, + **kwargs, + ): + board = Pedalboard() + if kwargs.get("reverb", False): + reverb = Reverb( + room_size=kwargs.get("reverb_room_size", 0.5), + damping=kwargs.get("reverb_damping", 0.5), + wet_level=kwargs.get("reverb_wet_level", 0.33), + dry_level=kwargs.get("reverb_dry_level", 0.4), + width=kwargs.get("reverb_width", 1.0), + freeze_mode=kwargs.get("reverb_freeze_mode", 0), + ) + board.append(reverb) + if kwargs.get("pitch_shift", False): + pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0)) + board.append(pitch_shift) + if kwargs.get("limiter", False): + limiter = Limiter( + threshold_db=kwargs.get("limiter_threshold", -6), + release_ms=kwargs.get("limiter_release", 0.05), + ) + board.append(limiter) + if kwargs.get("gain", False): + gain = Gain(gain_db=kwargs.get("gain_db", 0)) + board.append(gain) + if kwargs.get("distortion", False): + distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25)) + board.append(distortion) + if kwargs.get("chorus", False): + chorus = Chorus( + rate_hz=kwargs.get("chorus_rate", 1.0), + depth=kwargs.get("chorus_depth", 0.25), + centre_delay_ms=kwargs.get("chorus_delay", 7), + feedback=kwargs.get("chorus_feedback", 0.0), + mix=kwargs.get("chorus_mix", 0.5), + ) + board.append(chorus) + if kwargs.get("bitcrush", False): + bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8)) + board.append(bitcrush) + if kwargs.get("clipping", False): + clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0)) + board.append(clipping) + if kwargs.get("compressor", False): + compressor = Compressor( + threshold_db=kwargs.get("compressor_threshold", 0), + ratio=kwargs.get("compressor_ratio", 1), + attack_ms=kwargs.get("compressor_attack", 1.0), + release_ms=kwargs.get("compressor_release", 100), + ) + board.append(compressor) + if kwargs.get("delay", False): + delay = Delay( + delay_seconds=kwargs.get("delay_seconds", 0.5), + feedback=kwargs.get("delay_feedback", 0.0), + mix=kwargs.get("delay_mix", 0.5), + ) + board.append(delay) + return board(audio_input, sample_rate) + + def convert_audio( + self, + audio_input_path: str, + audio_output_path: str, + model_path: str, + index_path: str, + pitch: int = 0, + f0_file: str = None, + f0_method: str = "rmvpe", + index_rate: float = 0.75, + volume_envelope: float = 1, + protect: float = 0.5, + hop_length: int = 128, + split_audio: bool = False, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + filter_radius: int = 3, + embedder_model: str = "contentvec", + embedder_model_custom: str = None, + clean_audio: bool = False, + clean_strength: float = 0.5, + export_format: str = "WAV", + upscale_audio: bool = False, + post_process: bool = False, + resample_sr: int = 0, + sid: int = 0, + 
**kwargs, + ): + """ + Performs voice conversion on the input audio. + + Args: + pitch (int): Key for F0 up-sampling. + filter_radius (int): Radius for filtering. + index_rate (float): Rate for index matching. + volume_envelope (int): RMS mix rate. + protect (float): Protection rate for certain audio segments. + hop_length (int): Hop length for audio processing. + f0_method (str): Method for F0 extraction. + audio_input_path (str): Path to the input audio file. + audio_output_path (str): Path to the output audio file. + model_path (str): Path to the voice conversion model. + index_path (str): Path to the index file. + split_audio (bool): Whether to split the audio for processing. + f0_autotune (bool): Whether to use F0 autotune. + clean_audio (bool): Whether to clean the audio. + clean_strength (float): Strength of the audio cleaning. + export_format (str): Format for exporting the audio. + upscale_audio (bool): Whether to upscale the audio. + f0_file (str): Path to the F0 file. + embedder_model (str): Path to the embedder model. + embedder_model_custom (str): Path to the custom embedder model. + resample_sr (int, optional): Resample sampling rate. Default is 0. + sid (int, optional): Speaker ID. Default is 0. + **kwargs: Additional keyword arguments. + """ + self.get_vc(model_path, sid) + try: + start_time = time.time() + print(f"Converting audio '{audio_input_path}'...") + + audio = load_audio_infer( + audio_input_path, + 16000, + **kwargs, + ) + audio_max = np.abs(audio).max() / 0.95 + + if audio_max > 1: + audio /= audio_max + + if not self.hubert_model or embedder_model != self.last_embedder_model: + self.load_hubert(embedder_model, embedder_model_custom) + self.last_embedder_model = embedder_model + + file_index = ( + index_path.strip() + .strip('"') + .strip("\n") + .strip('"') + .strip() + .replace("trained", "added") + ) + + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + + if split_audio: + chunks, intervals = process_audio(audio, 16000) + print(f"Audio split into {len(chunks)} chunks for processing.") + else: + chunks = [] + chunks.append(audio) + + converted_chunks = [] + for c in chunks: + audio_opt = self.vc.pipeline( + model=self.hubert_model, + net_g=self.net_g, + sid=sid, + audio=c, + pitch=pitch, + f0_method=f0_method, + file_index=file_index, + index_rate=index_rate, + pitch_guidance=self.use_f0, + filter_radius=filter_radius, + volume_envelope=volume_envelope, + version=self.version, + protect=protect, + hop_length=hop_length, + f0_autotune=f0_autotune, + f0_autotune_strength=f0_autotune_strength, + f0_file=f0_file, + ) + converted_chunks.append(audio_opt) + if split_audio: + print(f"Converted audio chunk {len(converted_chunks)}") + + if split_audio: + audio_opt = merge_audio(converted_chunks, intervals, 16000, self.tgt_sr) + else: + audio_opt = converted_chunks[0] + + if clean_audio: + cleaned_audio = self.remove_audio_noise( + audio_opt, self.tgt_sr, clean_strength + ) + if cleaned_audio is not None: + audio_opt = cleaned_audio + + if post_process: + audio_opt = self.post_process_audio( + audio_input=audio_opt, + sample_rate=self.tgt_sr, + **kwargs, + ) + + sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV") + output_path_format = audio_output_path.replace( + ".wav", f".{export_format.lower()}" + ) + audio_output_path = self.convert_audio_format( + audio_output_path, output_path_format, export_format + ) + + elapsed_time = time.time() - start_time + print( + f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} 
seconds." + ) + except Exception as error: + print(f"An error occurred during audio conversion: {error}") + print(traceback.format_exc()) + + def convert_audio_batch( + self, + audio_input_paths: str, + audio_output_path: str, + **kwargs, + ): + """ + Performs voice conversion on a batch of input audio files. + + Args: + audio_input_paths (str): List of paths to the input audio files. + audio_output_path (str): Path to the output audio file. + resample_sr (int, optional): Resample sampling rate. Default is 0. + sid (int, optional): Speaker ID. Default is 0. + **kwargs: Additional keyword arguments. + """ + pid = os.getpid() + try: + with open( + os.path.join(now_dir, "assets", "infer_pid.txt"), "w" + ) as pid_file: + pid_file.write(str(pid)) + start_time = time.time() + print(f"Converting audio batch '{audio_input_paths}'...") + audio_files = [ + f + for f in os.listdir(audio_input_paths) + if f.endswith( + ( + "wav", + "mp3", + "flac", + "ogg", + "opus", + "m4a", + "mp4", + "aac", + "alac", + "wma", + "aiff", + "webm", + "ac3", + ) + ) + ] + print(f"Detected {len(audio_files)} audio files for inference.") + for a in audio_files: + new_input = os.path.join(audio_input_paths, a) + new_output = os.path.splitext(a)[0] + "_output.wav" + new_output = os.path.join(audio_output_path, new_output) + if os.path.exists(new_output): + continue + self.convert_audio( + audio_input_path=new_input, + audio_output_path=new_output, + **kwargs, + ) + print(f"Conversion completed at '{audio_input_paths}'.") + elapsed_time = time.time() - start_time + print(f"Batch conversion completed in {elapsed_time:.2f} seconds.") + except Exception as error: + print(f"An error occurred during audio batch conversion: {error}") + print(traceback.format_exc()) + finally: + os.remove(os.path.join(now_dir, "assets", "infer_pid.txt")) + + def get_vc(self, weight_root, sid): + """ + Loads the voice conversion model and sets up the pipeline. + + Args: + weight_root (str): Path to the model weights. + sid (int): Speaker ID. + """ + if sid == "" or sid == []: + self.cleanup_model() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + if not self.loaded_model or self.loaded_model != weight_root: + self.load_model(weight_root) + if self.cpt is not None: + self.setup_network() + self.setup_vc_instance() + self.loaded_model = weight_root + + def cleanup_model(self): + """ + Cleans up the model and releases resources. + """ + if self.hubert_model is not None: + del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr + self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + self.cpt = None + + def load_model(self, weight_root): + """ + Loads the model weights from the specified path. + + Args: + weight_root (str): Path to the model weights. + """ + self.cpt = ( + torch.load(weight_root, map_location="cpu") + if os.path.isfile(weight_root) + else None + ) + + def setup_network(self): + """ + Sets up the network configuration based on the loaded checkpoint. 
+ """ + if self.cpt is not None: + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] + self.use_f0 = self.cpt.get("f0", 1) + + self.version = self.cpt.get("version", "v1") + self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 + self.net_g = Synthesizer( + *self.cpt["config"], + use_f0=self.use_f0, + text_enc_hidden_dim=self.text_enc_hidden_dim, + is_half=self.config.is_half, + ) + del self.net_g.enc_q + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + self.net_g = ( + self.net_g.half() if self.config.is_half else self.net_g.float() + ) + + def setup_vc_instance(self): + """ + Sets up the voice conversion pipeline instance based on the target sampling rate and configuration. + """ + if self.cpt is not None: + self.vc = VC(self.tgt_sr, self.config) + self.n_spk = self.cpt["config"][-3] diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py new file mode 100644 index 00000000..6f9e554e --- /dev/null +++ b/rvc/infer/pipeline.py @@ -0,0 +1,708 @@ +import os +import gc +import re +import sys +import torch +import torch.nn.functional as F +import torchcrepe +import faiss +import librosa +import numpy as np +from scipy import signal +from torch import Tensor + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from rvc.lib.predictors.RMVPE import RMVPE0Predictor +from rvc.lib.predictors.FCPE import FCPEF0Predictor + +import logging + +logging.getLogger("faiss").setLevel(logging.WARNING) + +# Constants for high-pass filter +FILTER_ORDER = 5 +CUTOFF_FREQUENCY = 48 # Hz +SAMPLE_RATE = 16000 # Hz +bh, ah = signal.butter( + N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE +) + +input_audio_path2wav = {} + + +class AudioProcessor: + """ + A class for processing audio signals, specifically for adjusting RMS levels. + """ + + def change_rms( + source_audio: np.ndarray, + source_rate: int, + target_audio: np.ndarray, + target_rate: int, + rate: float, + ) -> np.ndarray: + """ + Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate. + + Args: + source_audio: The source audio signal as a NumPy array. + source_rate: The sampling rate of the source audio. + target_audio: The target audio signal to adjust. + target_rate: The sampling rate of the target audio. + rate: The blending rate between the source and target RMS levels. + """ + # Calculate RMS of both audio data + rms1 = librosa.feature.rms( + y=source_audio, + frame_length=source_rate // 2 * 2, + hop_length=source_rate // 2, + ) + rms2 = librosa.feature.rms( + y=target_audio, + frame_length=target_rate // 2 * 2, + hop_length=target_rate // 2, + ) + + # Interpolate RMS to match target audio length + rms1 = F.interpolate( + torch.from_numpy(rms1).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = F.interpolate( + torch.from_numpy(rms2).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6) + + # Adjust target audio RMS based on the source audio RMS + adjusted_audio = ( + target_audio + * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy() + ) + return adjusted_audio + + +class Autotune: + """ + A class for applying autotune to a given fundamental frequency (F0) contour. + """ + + def __init__(self, ref_freqs): + """ + Initializes the Autotune class with a set of reference frequencies. 
+ + Args: + ref_freqs: A list of reference frequencies representing musical notes. + """ + self.ref_freqs = ref_freqs + self.note_dict = self.ref_freqs # No interpolation needed + + def autotune_f0(self, f0, f0_autotune_strength): + """ + Autotunes a given F0 contour by snapping each frequency to the closest reference frequency. + + Args: + f0: The input F0 contour as a NumPy array. + """ + autotuned_f0 = np.zeros_like(f0) + for i, freq in enumerate(f0): + closest_note = min(self.note_dict, key=lambda x: abs(x - freq)) + autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength + return autotuned_f0 + + +class Pipeline: + """ + The main pipeline class for performing voice conversion, including preprocessing, F0 estimation, + voice conversion using a model, and post-processing. + """ + + def __init__(self, tgt_sr, config): + """ + Initializes the Pipeline class with target sampling rate and configuration parameters. + + Args: + tgt_sr: The target sampling rate for the output audio. + config: A configuration object containing various parameters for the pipeline. + """ + self.x_pad = config.x_pad + self.x_query = config.x_query + self.x_center = config.x_center + self.x_max = config.x_max + self.is_half = config.is_half + self.sample_rate = 16000 + self.window = 160 + self.t_pad = self.sample_rate * self.x_pad + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sample_rate * self.x_query + self.t_center = self.sample_rate * self.x_center + self.t_max = self.sample_rate * self.x_max + self.time_step = self.window / self.sample_rate * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.device = config.device + self.ref_freqs = [ + 49.00, # G1 + 51.91, # G#1 / Ab1 + 55.00, # A1 + 58.27, # A#1 / Bb1 + 61.74, # B1 + 65.41, # C2 + 69.30, # C#2 / Db2 + 73.42, # D2 + 77.78, # D#2 / Eb2 + 82.41, # E2 + 87.31, # F2 + 92.50, # F#2 / Gb2 + 98.00, # G2 + 103.83, # G#2 / Ab2 + 110.00, # A2 + 116.54, # A#2 / Bb2 + 123.47, # B2 + 130.81, # C3 + 138.59, # C#3 / Db3 + 146.83, # D3 + 155.56, # D#3 / Eb3 + 164.81, # E3 + 174.61, # F3 + 185.00, # F#3 / Gb3 + 196.00, # G3 + 207.65, # G#3 / Ab3 + 220.00, # A3 + 233.08, # A#3 / Bb3 + 246.94, # B3 + 261.63, # C4 + 277.18, # C#4 / Db4 + 293.66, # D4 + 311.13, # D#4 / Eb4 + 329.63, # E4 + 349.23, # F4 + 369.99, # F#4 / Gb4 + 392.00, # G4 + 415.30, # G#4 / Ab4 + 440.00, # A4 + 466.16, # A#4 / Bb4 + 493.88, # B4 + 523.25, # C5 + 554.37, # C#5 / Db5 + 587.33, # D5 + 622.25, # D#5 / Eb5 + 659.25, # E5 + 698.46, # F5 + 739.99, # F#5 / Gb5 + 783.99, # G5 + 830.61, # G#5 / Ab5 + 880.00, # A5 + 932.33, # A#5 / Bb5 + 987.77, # B5 + 1046.50, # C6 + ] + self.autotune = Autotune(self.ref_freqs) + self.note_dict = self.autotune.note_dict + self.model_rmvpe = RMVPE0Predictor( + os.path.join("rvc", "models", "predictors", "rmvpe.pt"), + is_half=self.is_half, + device=self.device, + ) + + def get_f0_crepe( + self, + x, + f0_min, + f0_max, + p_len, + hop_length, + model="full", + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model. + + Args: + x: The input audio signal as a NumPy array. + f0_min: Minimum F0 value to consider. + f0_max: Maximum F0 value to consider. + p_len: Desired length of the F0 output. + hop_length: Hop length for the Crepe model. + model: Crepe model size to use ("full" or "tiny"). 
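`autotune_f0` pulls every frame toward the nearest entry of `ref_freqs`, scaled by `f0_autotune_strength` (1.0 snaps fully, 0.0 leaves the contour alone). The same loop restated with NumPy and a few hypothetical F0 values:

```python
import numpy as np

ref_freqs = [220.00, 233.08, 246.94, 261.63]    # a subset of the reference notes above
f0 = np.array([225.0, 241.0, 258.0])            # hypothetical detected F0 per frame
strength = 1.0

autotuned = np.array([
    f + (min(ref_freqs, key=lambda r: abs(r - f)) - f) * strength
    for f in f0
])
print(autotuned)    # [220.   246.94 261.63] — each frame snapped to its closest note
```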
+ """ + x = x.astype(np.float32) + x /= np.quantile(np.abs(x), 0.999) + audio = torch.from_numpy(x).to(self.device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + pitch: Tensor = torchcrepe.predict( + audio, + self.sample_rate, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=self.device, + pad=True, + ) + p_len = p_len or x.shape[0] // hop_length + source = np.array(pitch.squeeze(0).cpu().float().numpy()) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source, + ) + f0 = np.nan_to_num(target) + return f0 + + def get_f0_hybrid( + self, + methods_str, + x, + f0_min, + f0_max, + p_len, + hop_length, + ): + """ + Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods. + + Args: + methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]"). + x: The input audio signal as a NumPy array. + f0_min: Minimum F0 value to consider. + f0_max: Maximum F0 value to consider. + p_len: Desired length of the F0 output. + hop_length: Hop length for F0 estimation methods. + """ + methods_str = re.search("hybrid\[(.+)\]", methods_str) + if methods_str: + methods = [method.strip() for method in methods_str.group(1).split("+")] + f0_computation_stack = [] + print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}") + x = x.astype(np.float32) + x /= np.quantile(np.abs(x), 0.999) + for method in methods: + f0 = None + if method == "crepe": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, p_len, int(hop_length) + ) + elif method == "rmvpe": + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = f0[1:] + elif method == "fcpe": + self.model_fcpe = FCPEF0Predictor( + os.path.join("rvc", "models", "predictors", "fcpe.pt"), + f0_min=int(f0_min), + f0_max=int(f0_max), + dtype=torch.float32, + device=self.device, + sample_rate=self.sample_rate, + threshold=0.03, + ) + f0 = self.model_fcpe.compute_f0(x, p_len=p_len) + del self.model_fcpe + gc.collect() + f0_computation_stack.append(f0) + + f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None] + f0_median_hybrid = None + if len(f0_computation_stack) == 1: + f0_median_hybrid = f0_computation_stack[0] + else: + f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0) + return f0_median_hybrid + + def get_f0( + self, + input_audio_path, + x, + p_len, + pitch, + f0_method, + filter_radius, + hop_length, + f0_autotune, + f0_autotune_strength, + inp_f0=None, + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using various methods. + + Args: + input_audio_path: Path to the input audio file. + x: The input audio signal as a NumPy array. + p_len: Desired length of the F0 output. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation (e.g., "crepe"). + filter_radius: Radius for median filtering the F0 contour. + hop_length: Hop length for F0 estimation methods. + f0_autotune: Whether to apply autotune to the F0 contour. + inp_f0: Optional input F0 contour to use instead of estimating. 
+ """ + global input_audio_path2wav + if f0_method == "crepe": + f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length)) + elif f0_method == "crepe-tiny": + f0 = self.get_f0_crepe( + x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny" + ) + elif f0_method == "rmvpe": + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + elif f0_method == "fcpe": + self.model_fcpe = FCPEF0Predictor( + os.path.join("rvc", "models", "predictors", "fcpe.pt"), + f0_min=int(self.f0_min), + f0_max=int(self.f0_max), + dtype=torch.float32, + device=self.device, + sample_rate=self.sample_rate, + threshold=0.03, + ) + f0 = self.model_fcpe.compute_f0(x, p_len=p_len) + del self.model_fcpe + gc.collect() + elif "hybrid" in f0_method: + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = self.get_f0_hybrid( + f0_method, + x, + self.f0_min, + self.f0_max, + p_len, + hop_length, + ) + + if f0_autotune is True: + f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength) + + f0 *= pow(2, pitch / 12) + tf0 = self.sample_rate // self.window + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(int) + + return f0_coarse, f0bak + + def voice_conversion( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + index, + big_npy, + index_rate, + version, + protect, + ): + """ + Performs voice conversion on a given audio segment. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio0: The input audio segment. + pitch: Quantized F0 contour for pitch guidance. + pitchf: Original F0 contour for pitch guidance. + index: FAISS index for speaker embedding retrieval. + big_npy: Speaker embeddings stored in a NumPy array. + index_rate: Blending rate for speaker embedding retrieval. + version: Model version ("v1" or "v2"). + protect: Protection level for preserving the original pitch. 
+ """ + with torch.no_grad(): + pitch_guidance = pitch != None and pitchf != None + # prepare source audio + feats = ( + torch.from_numpy(audio0).half() + if self.is_half + else torch.from_numpy(audio0).float() + ) + feats = feats.mean(-1) if feats.dim() == 2 else feats + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1).to(self.device) + # extract features + feats = model(feats)["last_hidden_state"] + feats = ( + model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats + ) + # make a copy for pitch guidance and protection + feats0 = feats.clone() if pitch_guidance else None + if ( + index + ): # set by parent function, only true if index is available, loaded, and index rate > 0 + feats = self._retrieve_speaker_embeddings( + feats, index, big_npy, index_rate + ) + # feature upsampling + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + # adjust the length if the audio is short + p_len = min(audio0.shape[0] // self.window, feats.shape[1]) + if pitch_guidance: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len] + # Pitch protection blending + if protect < 0.5: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + feats = feats * pitchff.unsqueeze(-1) + feats0 * ( + 1 - pitchff.unsqueeze(-1) + ) + feats = feats.to(feats0.dtype) + else: + pitch, pitchf = None, None + p_len = torch.tensor([p_len], device=self.device).long() + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + # clean up + del feats, feats0, p_len + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio1 + + def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate): + npy = feats[0].cpu().numpy() + npy = npy.astype("float32") if self.is_half else npy + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + npy = npy.astype("float16") if self.is_half else npy + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + return feats + + def pipeline( + self, + model, + net_g, + sid, + audio, + pitch, + f0_method, + file_index, + index_rate, + pitch_guidance, + filter_radius, + volume_envelope, + version, + protect, + hop_length, + f0_autotune, + f0_autotune_strength, + f0_file, + ): + """ + The main pipeline function for performing voice conversion. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio: The input audio signal. + input_audio_path: Path to the input audio file. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation. + file_index: Path to the FAISS index file for speaker embedding retrieval. + index_rate: Blending rate for speaker embedding retrieval. + pitch_guidance: Whether to use pitch guidance during voice conversion. + filter_radius: Radius for median filtering the F0 contour. + tgt_sr: Target sampling rate for the output audio. + resample_sr: Resampling rate for the output audio. + volume_envelope: Blending rate for adjusting the RMS level of the output audio. + version: Model version. + protect: Protection level for preserving the original pitch. + hop_length: Hop length for F0 estimation methods. 
+ f0_autotune: Whether to apply autotune to the F0 contour. + f0_file: Path to a file containing an F0 contour to use. + """ + if file_index != "" and os.path.exists(file_index) and index_rate > 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + print(f"An error occurred reading the FAISS index: {error}") + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except Exception as error: + print(f"An error occurred reading the F0 file: {error}") + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + if pitch_guidance: + pitch, pitchf = self.get_f0( + "input_audio_path", # questionable purpose of making a key for an array + audio_pad, + p_len, + pitch, + f0_method, + filter_radius, + hop_length, + f0_autotune, + f0_autotune_strength, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + for t in opt_ts: + t = t // self.window * self.window + if pitch_guidance: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if pitch_guidance: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if volume_envelope != 1: + audio_opt = AudioProcessor.change_rms( + audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope + ) + # if resample_sr >= self.sample_rate and tgt_sr != resample_sr: + # audio_opt = 
librosa.resample( + # audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + # ) + # audio_max = np.abs(audio_opt).max() / 0.99 + # max_int16 = 32768 + # if audio_max > 1: + # max_int16 /= audio_max + # audio_opt = (audio_opt * 32768).astype(np.int16) + audio_max = np.abs(audio_opt).max() / 0.99 + if audio_max > 1: + audio_opt /= audio_max + if pitch_guidance: + del pitch, pitchf + del sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/assets/fcpe/.gitkeep b/rvc/lib/algorithm/__init__.py similarity index 100% rename from assets/fcpe/.gitkeep rename to rvc/lib/algorithm/__init__.py diff --git a/rvc/lib/algorithm/attentions.py b/rvc/lib/algorithm/attentions.py new file mode 100644 index 00000000..37367ada --- /dev/null +++ b/rvc/lib/algorithm/attentions.py @@ -0,0 +1,243 @@ +import math +import torch +from rvc.lib.algorithm.commons import convert_pad_shape + + +class MultiHeadAttention(torch.nn.Module): + """ + Multi-head attention module with optional relative positional encoding and proximal bias. + + Args: + channels (int): Number of input channels. + out_channels (int): Number of output channels. + n_heads (int): Number of attention heads. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + window_size (int, optional): Window size for relative positional encoding. Defaults to None. + heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True. + block_length (int, optional): Block length for local attention. Defaults to None. + proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False. + proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False. + """ + + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert ( + channels % n_heads == 0 + ), "Channels must be divisible by the number of heads." 
+ + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.k_channels = channels // n_heads + self.window_size = window_size + self.block_length = block_length + self.proximal_bias = proximal_bias + + # Define projections + self.conv_q = torch.nn.Conv1d(channels, channels, 1) + self.conv_k = torch.nn.Conv1d(channels, channels, 1) + self.conv_v = torch.nn.Conv1d(channels, channels, 1) + self.conv_o = torch.nn.Conv1d(channels, out_channels, 1) + + self.drop = torch.nn.Dropout(p_dropout) + + # Relative positional encodings + if window_size: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = torch.nn.Parameter( + torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = torch.nn.Parameter( + torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels) + * rel_stddev + ) + + # Initialize weights + torch.nn.init.xavier_uniform_(self.conv_q.weight) + torch.nn.init.xavier_uniform_(self.conv_k.weight) + torch.nn.init.xavier_uniform_(self.conv_v.weight) + torch.nn.init.xavier_uniform_(self.conv_o.weight) + + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + # Compute query, key, value projections + q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c) + + # Compute attention + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + # Final output projection + return self.conv_o(x) + + def attention(self, query, key, value, mask=None): + # Reshape and compute scaled dot-product attention + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + + if self.window_size: + assert t_s == t_t, "Relative attention only supports self-attention." + scores += self._compute_relative_scores(query, t_s) + + if self.proximal_bias: + assert t_s == t_t, "Proximal bias only supports self-attention." 
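A rough shape-level check of the attention module (self-attention case, assuming the `rvc` package is importable); inputs are `(batch, channels, time)`, as the 1×1 convolutions imply:

```python
import torch
from rvc.lib.algorithm.attentions import MultiHeadAttention

attn = MultiHeadAttention(channels=192, out_channels=192, n_heads=2, window_size=10)
x = torch.randn(1, 192, 64)    # (batch, channels, time)
out = attn(x, x)               # query source == key/value source -> self-attention
print(out.shape)               # torch.Size([1, 192, 64])
```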
+ scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype) + + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length: + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + + # Apply softmax and dropout + p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1)) + + # Compute attention output + output = torch.matmul(p_attn, value) + + if self.window_size: + output += self._apply_relative_values(p_attn, t_s) + + return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn + + def _compute_relative_scores(self, query, length): + rel_emb = self._get_relative_embeddings(self.emb_rel_k, length) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), rel_emb + ) + return self._relative_position_to_absolute_position(rel_logits) + + def _apply_relative_values(self, p_attn, length): + rel_weights = self._absolute_position_to_relative_position(p_attn) + rel_emb = self._get_relative_embeddings(self.emb_rel_v, length) + return self._matmul_with_relative_values(rel_weights, rel_emb) + + # Helper methods + def _matmul_with_relative_values(self, x, y): + return torch.matmul(x, y.unsqueeze(0)) + + def _matmul_with_relative_keys(self, x, y): + return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + + def _get_relative_embeddings(self, embeddings, length): + pad_length = max(length - (self.window_size + 1), 0) + start = max((self.window_size + 1) - length, 0) + end = start + 2 * length - 1 + + if pad_length > 0: + embeddings = torch.nn.functional.pad( + embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + return embeddings[:, start:end] + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + ) + x_flat = x.view(batch, heads, length * 2 * length) + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + return x_flat.view(batch, heads, length + 1, 2 * length - 1)[ + :, :, :length, length - 1 : + ] + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view(batch, heads, length**2 + length * (length - 1)) + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]) + ) + return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:] + + def _attention_bias_proximal(self, length): + r = torch.arange(length, dtype=torch.float32) + diff = r.unsqueeze(0) - r.unsqueeze(1) + return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0) + + +class FFN(torch.nn.Module): + """ + Feed-forward network module. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + filter_channels (int): Number of filter channels in the convolution layers. + kernel_size (int): Kernel size of the convolution layers. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + activation (str, optional): Activation function to use. Defaults to None. + causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.padding_fn = self._causal_padding if causal else self._same_padding + + self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = torch.nn.Dropout(p_dropout) + + self.activation = activation + + def forward(self, x, x_mask): + x = self.conv_1(self.padding_fn(x * x_mask)) + x = self._apply_activation(x) + x = self.drop(x) + x = self.conv_2(self.padding_fn(x * x_mask)) + return x * x_mask + + def _apply_activation(self, x): + if self.activation == "gelu": + return x * torch.sigmoid(1.702 * x) + return torch.relu(x) + + def _causal_padding(self, x): + pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0 + return torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]]) + ) + + def _same_padding(self, x): + pad = (self.conv_1.kernel_size[0] - 1) // 2 + return torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [pad, pad]]) + ) diff --git a/rvc/lib/algorithm/commons.py b/rvc/lib/algorithm/commons.py new file mode 100644 index 00000000..2524abc4 --- /dev/null +++ b/rvc/lib/algorithm/commons.py @@ -0,0 +1,207 @@ +import math +import torch +from typing import List, Optional + + +def init_weights(m, mean=0.0, std=0.01): + """ + Initialize the weights of a module. + + Args: + m: The module to initialize. + mean: The mean of the normal distribution. + std: The standard deviation of the normal distribution. + """ + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + """ + Calculate the padding needed for a convolution. + + Args: + kernel_size: The size of the kernel. + dilation: The dilation of the convolution. + """ + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + """ + Convert the pad shape to a list of integers. + + Args: + pad_shape: The pad shape.. + """ + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """ + Calculate the KL divergence between two distributions. + + Args: + m_p: The mean of the first distribution. + logs_p: The log of the standard deviation of the first distribution. + m_q: The mean of the second distribution. + logs_q: The log of the standard deviation of the second distribution. + """ + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def slice_segments( + x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2 +): + """ + Slice segments from a tensor, handling tensors with different numbers of dimensions. + + Args: + x (torch.Tensor): The tensor to slice. + ids_str (torch.Tensor): The starting indices of the segments. + segment_size (int, optional): The size of each segment. Defaults to 4. + dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2. 
+ """ + if dim == 2: + ret = torch.zeros_like(x[:, :segment_size]) + elif dim == 3: + ret = torch.zeros_like(x[:, :, :segment_size]) + + for i in range(x.size(0)): + idx_str = ids_str[i].item() + idx_end = idx_str + segment_size + if dim == 2: + ret[i] = x[i, idx_str:idx_end] + else: + ret[i] = x[i, :, idx_str:idx_end] + + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + """ + Randomly slice segments from a tensor. + + Args: + x: The tensor to slice. + x_lengths: The lengths of the sequences. + segment_size: The size of each segment. + """ + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size, dim=3) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + """ + Generate a 1D timing signal. + + Args: + length: The length of the signal. + channels: The number of channels of the signal. + min_timescale: The minimum timescale. + max_timescale: The maximum timescale. + """ + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def subsequent_mask(length): + """ + Generate a subsequent mask. + + Args: + length: The length of the sequence. + """ + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + """ + Fused add tanh sigmoid multiply operation. + + Args: + input_a: The first input tensor. + input_b: The second input tensor. + n_channels: The number of channels. + """ + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: + """ + Convert the pad shape to a list of integers. + + Args: + pad_shape: The pad shape. + """ + return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() + + +def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): + """ + Generate a sequence mask. + + Args: + length: The lengths of the sequences. + max_length: The maximum length of the sequences. + """ + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def clip_grad_value(parameters, clip_value, norm_type=2): + """ + Clip the gradients of a list of parameters. + + Args: + parameters: The list of parameters to clip. + clip_value: The maximum value of the gradients. + norm_type: The type of norm to use for clipping. 
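# Illustrative usage (example values only) of the masking and slicing helpers above:
# sequence_mask builds a boolean (batch, max_len) mask from per-item lengths, and
# rand_slice_segments draws one random fixed-size window per batch item.
import torch
from rvc.lib.algorithm.commons import rand_slice_segments, sequence_mask

lengths = torch.tensor([3, 5])
print(sequence_mask(lengths))              # (2, 5); row 0 is True only for the first 3 steps
x = torch.randn(2, 8, 10)                  # (batch, channels, time)
segments, ids = rand_slice_segments(x, torch.tensor([10, 10]), segment_size=4)
print(segments.shape, ids.shape)           # torch.Size([2, 8, 4]) torch.Size([2])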
+ """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py new file mode 100644 index 00000000..99251ad6 --- /dev/null +++ b/rvc/lib/algorithm/discriminators.py @@ -0,0 +1,160 @@ +import torch +from torch.nn.utils.parametrizations import spectral_norm, weight_norm + +from rvc.lib.algorithm.commons import get_padding +from rvc.lib.algorithm.residuals import LRELU_SLOPE + + +class MultiPeriodDiscriminator(torch.nn.Module): + """ + Multi-period discriminator. + + This class implements a multi-period discriminator, which is used to + discriminate between real and fake audio signals. The discriminator + is composed of a series of convolutional layers that are applied to + the input signal at different periods. + + Args: + periods (str): Periods of the discriminator. V1 = [2, 3, 5, 7, 11, 17], V2 = [2, 3, 5, 7, 11, 17, 23, 37]. + use_spectral_norm (bool): Whether to use spectral normalization. + Defaults to False. + """ + + def __init__(self, version, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = ( + [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37] + ) + self.discriminators = torch.nn.ModuleList( + [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods] + ) + + def forward(self, y, y_hat): + """ + Forward pass of the multi-period discriminator. + + Args: + y (torch.Tensor): Real audio signal. + y_hat (torch.Tensor): Fake audio signal. + """ + y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] + for d in self.discriminators: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + """ + Discriminator for the short-term component. + + This class implements a discriminator for the short-term component + of the audio signal. The discriminator is composed of a series of + convolutional layers that are applied to the input signal. + """ + + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = spectral_norm if use_spectral_norm else weight_norm + self.convs = torch.nn.ModuleList( + [ + norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)), + norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1)) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + + def forward(self, x): + """ + Forward pass of the discriminator. + + Args: + x (torch.Tensor): Input audio signal. 
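# Minimal usage sketch (CPU-sized example values, not part of the file) for the
# MultiPeriodDiscriminator above: it returns per-sub-discriminator logits and feature maps
# for both the real and the generated waveform.
import torch
from rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator

mpd = MultiPeriodDiscriminator(version="v1")
real = torch.randn(1, 1, 8192)             # (batch, 1, samples)
fake = torch.randn(1, 1, 8192)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(real, fake)
print(len(y_d_rs))                         # 7 = DiscriminatorS + one DiscriminatorP per v1 period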
+ """ + fmap = [] + for conv in self.convs: + x = self.lrelu(conv(x)) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + """ + Discriminator for the long-term component. + + This class implements a discriminator for the long-term component + of the audio signal. The discriminator is composed of a series of + convolutional layers that are applied to the input signal at a given + period. + + Args: + period (int): Period of the discriminator. + kernel_size (int): Kernel size of the convolutional layers. + Defaults to 5. + stride (int): Stride of the convolutional layers. Defaults to 3. + use_spectral_norm (bool): Whether to use spectral normalization. + Defaults to False. + """ + + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = spectral_norm if use_spectral_norm else weight_norm + + in_channels = [1, 32, 128, 512, 1024] + out_channels = [32, 128, 512, 1024, 1024] + + self.convs = torch.nn.ModuleList( + [ + norm_f( + torch.nn.Conv2d( + in_ch, + out_ch, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ) + for in_ch, out_ch in zip(in_channels, out_channels) + ] + ) + + self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + + def forward(self, x): + """ + Forward pass of the discriminator. + + Args: + x (torch.Tensor): Input audio signal. + """ + fmap = [] + b, c, t = x.shape + if t % self.period != 0: + n_pad = self.period - (t % self.period) + x = torch.nn.functional.pad(x, (0, n_pad), "reflect") + x = x.view(b, c, -1, self.period) + + for conv in self.convs: + x = self.lrelu(conv(x)) + fmap.append(x) + + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap diff --git a/rvc/lib/algorithm/encoders.py b/rvc/lib/algorithm/encoders.py new file mode 100644 index 00000000..e52f9e7d --- /dev/null +++ b/rvc/lib/algorithm/encoders.py @@ -0,0 +1,218 @@ +import math +import torch +from typing import Optional + +from rvc.lib.algorithm.commons import sequence_mask +from rvc.lib.algorithm.modules import WaveNet +from rvc.lib.algorithm.normalization import LayerNorm +from rvc.lib.algorithm.attentions import FFN, MultiHeadAttention + + +class Encoder(torch.nn.Module): + """ + Encoder module for the Transformer model. + + Args: + hidden_channels (int): Number of hidden channels in the encoder. + filter_channels (int): Number of filter channels in the feed-forward network. + n_heads (int): Number of attention heads. + n_layers (int): Number of encoder layers. + kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + window_size (int, optional): Window size for relative positional encoding. Defaults to 10. 
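# The key step in DiscriminatorP above is folding the 1-D waveform into a 2-D grid so that
# plain Conv2d layers see one period per row; a tiny standalone illustration of that reshape:
import torch

period = 3
x = torch.randn(1, 1, 10)                        # 10 samples, not a multiple of the period
n_pad = period - (x.shape[-1] % period)          # -> 2
x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
print(x.view(1, 1, -1, period).shape)            # torch.Size([1, 1, 4, 3])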
+ """ + + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = torch.nn.Dropout(p_dropout) + self.attn_layers = torch.nn.ModuleList() + self.norm_layers_1 = torch.nn.ModuleList() + self.ffn_layers = torch.nn.ModuleList() + self.norm_layers_2 = torch.nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class TextEncoder(torch.nn.Module): + """Text Encoder with configurable embedding dimension. + + Args: + out_channels (int): Output channels of the encoder. + hidden_channels (int): Hidden channels of the encoder. + filter_channels (int): Filter channels of the encoder. + n_heads (int): Number of attention heads. + n_layers (int): Number of encoder layers. + kernel_size (int): Kernel size of the convolutional layers. + p_dropout (float): Dropout probability. + embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768). + f0 (bool, optional): Whether to use F0 embedding. Defaults to True. + """ + + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + embedding_dim, + f0=True, + ): + super(TextEncoder, self).__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels) + self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True) + if f0: + self.emb_pitch = torch.nn.Embedding(256, hidden_channels) + self.encoder = Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor + ): + if pitch is None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class PosteriorEncoder(torch.nn.Module): + """Posterior Encoder for inferring latent representation. 
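# Shape-level usage sketch for the TextEncoder above. The hyper-parameters below are
# illustrative values, not necessarily the configuration shipped with this patch.
import torch
from rvc.lib.algorithm.encoders import TextEncoder

enc = TextEncoder(
    out_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
    embedding_dim=768, f0=True,
)
phone = torch.randn(1, 50, 768)                  # (batch, frames, embedding_dim)
pitch = torch.randint(0, 256, (1, 50))           # coarse pitch buckets
lengths = torch.tensor([50])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)         # (1, 192, 50) (1, 192, 50) (1, 1, 50)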
+ + Args: + in_channels (int): Number of channels in the input. + out_channels (int): Number of channels in the output. + hidden_channels (int): Number of hidden channels in the encoder. + kernel_size (int): Kernel size of the convolutional layers. + dilation_rate (int): Dilation rate of the convolutional layers. + n_layers (int): Number of layers in the encoder. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. + """ + + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super(PosteriorEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WaveNet( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + ): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + """Removes weight normalization from the encoder.""" + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self diff --git a/rvc/lib/algorithm/generators.py b/rvc/lib/algorithm/generators.py new file mode 100644 index 00000000..ccc2358d --- /dev/null +++ b/rvc/lib/algorithm/generators.py @@ -0,0 +1,231 @@ +import torch +import numpy as np +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from typing import Optional + +from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2 +from rvc.lib.algorithm.commons import init_weights + + +class Generator(torch.nn.Module): + """Generator for synthesizing audio. + + Args: + initial_channel (int): Number of channels in the initial convolutional layer. + resblock (str): Type of residual block to use (1 or 2). + resblock_kernel_sizes (list): Kernel sizes of the residual blocks. + resblock_dilation_sizes (list): Dilation rates of the residual blocks. + upsample_rates (list): Upsampling rates. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. 
+ """ + + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = ResBlock1 if resblock == "1" else ResBlock2 + + self.ups = torch.nn.ModuleList() + self.resblocks = torch.nn.ModuleList() + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs == None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for l in self.ups_and_resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + """Removes weight normalization from the upsampling and residual blocks.""" + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise. + + Args: + sampling_rate (int): The sampling rate in Hz. + num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0. + sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1. + noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0. + """ + + def __init__( + self, + sampling_rate: int, + num_harmonics: int = 0, + sine_amplitude: float = 0.1, + noise_stddev: float = 0.003, + voiced_threshold: float = 0.0, + ): + super(SineGenerator, self).__init__() + self.sampling_rate = sampling_rate + self.num_harmonics = num_harmonics + self.sine_amplitude = sine_amplitude + self.noise_stddev = noise_stddev + self.voiced_threshold = voiced_threshold + self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics + + def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor: + """ + Generate a binary mask to indicate voiced/unvoiced frames. 
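# Illustrative shape check for the Generator above, using example HiFi-GAN-style
# hyper-parameters (assumptions, not necessarily the project's defaults): the output length is
# the number of input frames times the product of upsample_rates.
import torch
from rvc.lib.algorithm.generators import Generator

gen = Generator(
    initial_channel=192, resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 10, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
)
x = torch.randn(1, 192, 32)                      # 32 frames of latent features
print(gen(x).shape)                              # torch.Size([1, 1, 12800]) = 32 * (10*10*2*2)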
+ + Args: + f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length). + """ + uv_mask = (f0 > self.voiced_threshold).float() + return uv_mask + + def _generate_sine_wave( + self, f0: torch.Tensor, upsampling_factor: int + ) -> torch.Tensor: + """ + Generate sine waves for the fundamental frequency and its harmonics. + + Args: + f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). + upsampling_factor (int): Upsampling factor. + """ + batch_size, length, _ = f0.shape + + # Create an upsampling grid + upsampling_grid = torch.arange( + 1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device + ) + + # Calculate phase increments + phase_increments = (f0 / self.sampling_rate) * upsampling_grid + phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5 + cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype) + phase_increments += torch.nn.functional.pad( + cumulative_phase, (0, 0, 1, 0), mode="constant" + ) + + # Reshape to match the sine wave shape + phase_increments = phase_increments.reshape(batch_size, -1, 1) + + # Scale for harmonics + harmonic_scale = torch.arange( + 1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device + ).reshape(1, 1, -1) + phase_increments *= harmonic_scale + + # Add random phase offset (except for the fundamental) + random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device) + random_phase[..., 0] = 0 # Fundamental frequency has no random offset + phase_increments += random_phase + + # Generate sine waves + sine_waves = torch.sin(2 * np.pi * phase_increments) + return sine_waves + + def forward(self, f0: torch.Tensor, upsampling_factor: int): + """ + Forward pass to generate sine waveforms with noise and voiced/unvoiced masking. + + Args: + f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). + upsampling_factor (int): Upsampling factor. + """ + with torch.no_grad(): + # Expand `f0` to include waveform dimensions + f0 = f0.unsqueeze(-1) + + # Generate sine waves + sine_waves = ( + self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude + ) + + # Compute voiced/unvoiced mask + voiced_mask = self._compute_voiced_unvoiced(f0) + + # Upsample voiced/unvoiced mask + voiced_mask = torch.nn.functional.interpolate( + voiced_mask.transpose(2, 1), + scale_factor=float(upsampling_factor), + mode="nearest", + ).transpose(2, 1) + + # Compute noise amplitude + noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * ( + self.sine_amplitude / 3 + ) + + # Add Gaussian noise + noise = noise_amplitude * torch.randn_like(sine_waves) + + # Combine sine waves and noise + sine_waveforms = sine_waves * voiced_mask + noise + + return sine_waveforms, voiced_mask, noise diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py new file mode 100644 index 00000000..8a2dad1a --- /dev/null +++ b/rvc/lib/algorithm/modules.py @@ -0,0 +1,124 @@ +import torch +from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply + + +class WaveNet(torch.nn.Module): + """WaveNet residual blocks as used in WaveGlow. + + Args: + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + p_dropout (float, optional): Dropout probability. Defaults to 0. 
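# Small usage sketch (illustrative values) for the SineGenerator above: a frame-level F0
# contour is upsampled by `upsampling_factor` and rendered as a sine source plus a
# voiced/unvoiced mask and additive noise.
import torch
from rvc.lib.algorithm.generators import SineGenerator

sine_gen = SineGenerator(sampling_rate=16000)
f0 = torch.full((1, 100), 220.0)                 # 100 frames of a steady 220 Hz pitch
sine, voiced, _ = sine_gen(f0, upsampling_factor=160)
print(sine.shape, voiced.shape)                  # torch.Size([1, 16000, 1]) for both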
+ """ + + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super().__init__() + assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding." + + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = torch.nn.Dropout(p_dropout) + + # Conditional layer for global conditioning + if gin_channels: + self.cond_layer = torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), + name="weight", + ) + + # Precompute dilations and paddings + dilations = [dilation_rate**i for i in range(n_layers)] + paddings = [(kernel_size * d - d) // 2 for d in dilations] + + # Initialize layers + for i in range(n_layers): + self.in_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilations[i], + padding=paddings[i], + ), + name="weight", + ) + ) + + res_skip_channels = ( + hidden_channels if i == n_layers - 1 else 2 * hidden_channels + ) + self.res_skip_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(hidden_channels, res_skip_channels, 1), + name="weight", + ) + ) + + def forward(self, x, x_mask, g=None): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps). + x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps). + g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps). + """ + output = x.clone().zero_() + + # Apply conditional layer if global conditioning is provided + g = self.cond_layer(g) if g is not None else None + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + g_l = ( + g[ + :, + i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, + :, + ] + if g is not None + else 0 + ) + + # Activation with fused Tanh-Sigmoid + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor) + acts = self.drop(acts) + + # Residual and skip connections + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + + return output * x_mask + + def remove_weight_norm(self): + """Remove weight normalization from the module.""" + if self.gin_channels: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for layer in self.in_layers: + torch.nn.utils.remove_weight_norm(layer) + for layer in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(layer) diff --git a/rvc/lib/algorithm/normalization.py b/rvc/lib/algorithm/normalization.py new file mode 100644 index 00000000..878ec09d --- /dev/null +++ b/rvc/lib/algorithm/normalization.py @@ -0,0 +1,31 @@ +import torch + + +class LayerNorm(torch.nn.Module): + """Layer normalization module. + + Args: + channels (int): Number of channels. + eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5. 
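# The gated activation inside WaveNet above splits the 2 * hidden_channels pre-activation into
# a tanh half and a sigmoid half and multiplies them; a tiny standalone illustration with
# arbitrary example tensors:
import torch
from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply

hidden = 4
a = torch.randn(1, 2 * hidden, 10)               # e.g. the output of an in_layer conv
b = torch.zeros(1, 2 * hidden, 10)               # e.g. the conditioning slice (zero if absent)
acts = fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([hidden]))
print(acts.shape)                                # torch.Size([1, 4, 10])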
+ """ + + def __init__(self, channels, eps=1e-5): + super().__init__() + self.eps = eps + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). + + """ + # Transpose to (batch_size, time_steps, channels) for layer_norm + x = x.transpose(1, -1) + x = torch.nn.functional.layer_norm( + x, (x.size(-1),), self.gamma, self.beta, self.eps + ) + # Transpose back to (batch_size, channels, time_steps) + return x.transpose(1, -1) diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/nsf.py new file mode 100644 index 00000000..5476adab --- /dev/null +++ b/rvc/lib/algorithm/nsf.py @@ -0,0 +1,196 @@ +import math +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from typing import Optional + +from rvc.lib.algorithm.generators import SineGenerator +from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2 +from rvc.lib.algorithm.commons import init_weights + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for harmonic-plus-noise excitation. + + Args: + sample_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. + is_half (bool, optional): Whether to use half precision. Defaults to True. + """ + + def __init__( + self, + sample_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + + self.l_sin_gen = SineGenerator( + sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor, upsample_factor: int = 1): + sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None + + +class GeneratorNSF(torch.nn.Module): + """ + Generator for synthesizing audio using the NSF (Neural Source Filter) approach. + + Args: + initial_channel (int): Number of channels in the initial convolutional layer. + resblock (str): Type of residual block to use (1 or 2). + resblock_kernel_sizes (list): Kernel sizes of the residual blocks. + resblock_dilation_sizes (list): Dilation rates of the residual blocks. + upsample_rates (list): Upsampling rates. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. + gin_channels (int): Number of channels for the global conditioning input. + sr (int): Sampling rate. + is_half (bool, optional): Whether to use half precision. Defaults to False. 
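# Quick check (illustrative) for the channel-first LayerNorm above: it normalises over the
# channel axis of a (batch, channels, time) tensor by transposing around F.layer_norm.
import torch
from rvc.lib.algorithm.normalization import LayerNorm

ln = LayerNorm(channels=8)
x = torch.randn(2, 8, 50)
y = ln(x)
print(y.shape)                                   # torch.Size([2, 8, 50])
print(y.mean(dim=1).abs().max() < 1e-5)          # tensor(True): per-frame channel mean ~ 0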
+ """ + + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sample_rate=sr, harmonic_num=0, is_half=is_half + ) + + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock_cls = ResBlock1 if resblock == "1" else ResBlock2 + + self.ups = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + channels = [ + upsample_initial_channel // (2 ** (i + 1)) + for i in range(len(upsample_rates)) + ] + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + channels[i], + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + channels[i], + kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1), + stride=stride_f0s[i], + padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0), + ) + ) + + self.resblocks = torch.nn.ModuleList( + [ + resblock_cls(channels[i], k, d) + for i in range(len(self.ups)) + for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes) + ] + ) + + self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x, f0, g: Optional[torch.Tensor] = None): + har_source, _, _ = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) + x = ups(x) + x = x + noise_convs(har_source) + + xs = sum( + [ + resblock(x) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ] + ) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + return self diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py new file mode 100644 index 00000000..87805f72 --- /dev/null +++ b/rvc/lib/algorithm/residuals.py @@ -0,0 +1,250 @@ +from typing import Optional +import torch +from torch.nn.utils import remove_weight_norm +from 
torch.nn.utils.parametrizations import weight_norm + +from rvc.lib.algorithm.modules import WaveNet +from rvc.lib.algorithm.commons import get_padding, init_weights + +LRELU_SLOPE = 0.1 + + +def create_conv1d_layer(channels, kernel_size, dilation): + return weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + padding=get_padding(kernel_size, dilation), + ) + ) + + +def apply_mask(tensor, mask): + return tensor * mask if mask is not None else tensor + + +class ResBlockBase(torch.nn.Module): + def __init__(self, channels, kernel_size, dilations): + super(ResBlockBase, self).__init__() + self.convs1 = torch.nn.ModuleList( + [create_conv1d_layer(channels, kernel_size, d) for d in dilations] + ) + self.convs1.apply(init_weights) + + self.convs2 = torch.nn.ModuleList( + [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + xt = apply_mask(xt, x_mask) + xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE) + xt = apply_mask(xt, x_mask) + xt = c2(xt) + x = xt + x + return apply_mask(x, x_mask) + + def remove_weight_norm(self): + for conv in self.convs1 + self.convs2: + remove_weight_norm(conv) + + +class ResBlock1(ResBlockBase): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__(channels, kernel_size, dilation) + + +class ResBlock2(ResBlockBase): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__(channels, kernel_size, dilation) + + +class Flip(torch.nn.Module): + """Flip module for flow-based models. + + This module flips the input along the time dimension. + """ + + def forward(self, x, *args, reverse=False, **kwargs): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. + """ + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ResidualCouplingBlock(torch.nn.Module): + """Residual Coupling Block for normalizing flow. + + Args: + channels (int): Number of channels in the input. + hidden_channels (int): Number of hidden channels in the coupling layer. + kernel_size (int): Kernel size of the convolutional layers. + dilation_rate (int): Dilation rate of the convolutional layers. + n_layers (int): Number of layers in the coupling layer. + n_flows (int, optional): Number of coupling layers in the block. Defaults to 4. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. 
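# Quick shape check (illustrative) for the residual blocks above: thanks to get_padding, both
# variants preserve the time dimension of a (batch, channels, time) input.
import torch
from rvc.lib.algorithm.residuals import ResBlock1, ResBlock2

x = torch.randn(1, 32, 100)
print(ResBlock1(32, kernel_size=3, dilation=(1, 3, 5))(x).shape)   # torch.Size([1, 32, 100])
print(ResBlock2(32, kernel_size=3, dilation=(1, 3))(x).shape)      # torch.Size([1, 32, 100])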
+ """ + + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super(ResidualCouplingBlock, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = torch.nn.ModuleList() + for i in range(n_flows): + self.flows.append( + ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(Flip()) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow.forward(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + """Removes weight normalization from the coupling layers.""" + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + + +class ResidualCouplingLayer(torch.nn.Module): + """Residual coupling layer for flow-based models. + + Args: + channels (int): Number of channels. + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + p_dropout (float, optional): Dropout probability. Defaults to 0. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False. + """ + + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WaveNet( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = torch.nn.Conv1d( + hidden_channels, self.half_channels * (2 - mean_only), 1 + ) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). + x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps). + g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps). + Defaults to None. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. 
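# Illustrative round trip through the flow block above (example sizes only): because each
# coupling layer only shifts half of the channels by a function of the other half
# (mean_only=True), running the block forward and then with reverse=True recovers the input.
import torch
from rvc.lib.algorithm.residuals import ResidualCouplingBlock

flow = ResidualCouplingBlock(
    channels=4, hidden_channels=16, kernel_size=5, dilation_rate=1, n_layers=3
).eval()
x = torch.randn(1, 4, 20)
x_mask = torch.ones(1, 1, 20)
with torch.no_grad():
    z = flow(x, x_mask)                          # forward direction
    x_rec = flow(z, x_mask, reverse=True)        # inverse direction
print(torch.allclose(x, x_rec, atol=1e-5))       # True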
+ """ + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + """Remove weight normalization from the module.""" + self.enc.remove_weight_norm() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py new file mode 100644 index 00000000..2a1aa236 --- /dev/null +++ b/rvc/lib/algorithm/synthesizers.py @@ -0,0 +1,237 @@ +import torch +from typing import Optional + +from rvc.lib.algorithm.nsf import GeneratorNSF +from rvc.lib.algorithm.generators import Generator +from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments +from rvc.lib.algorithm.residuals import ResidualCouplingBlock +from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder + + +class Synthesizer(torch.nn.Module): + """ + Base Synthesizer model. + + Args: + spec_channels (int): Number of channels in the spectrogram. + segment_size (int): Size of the audio segment. + inter_channels (int): Number of channels in the intermediate layers. + hidden_channels (int): Number of channels in the hidden layers. + filter_channels (int): Number of channels in the filter layers. + n_heads (int): Number of attention heads. + n_layers (int): Number of layers in the encoder. + kernel_size (int): Size of the convolution kernel. + p_dropout (float): Dropout probability. + resblock (str): Type of residual block. + resblock_kernel_sizes (list): Kernel sizes for the residual blocks. + resblock_dilation_sizes (list): Dilation sizes for the residual blocks. + upsample_rates (list): Upsampling rates for the decoder. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes for the upsampling layers. + spk_embed_dim (int): Dimension of the speaker embedding. + gin_channels (int): Number of channels in the global conditioning vector. + sr (int): Sampling rate of the audio. + use_f0 (bool): Whether to use F0 information. + text_enc_hidden_dim (int): Hidden dimension for the text encoder. + kwargs: Additional keyword arguments. 
+ """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + use_f0, + text_enc_hidden_dim=768, + **kwargs + ): + super(Synthesizer, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.spk_embed_dim = spk_embed_dim + self.use_f0 = use_f0 + + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + text_enc_hidden_dim, + f0=use_f0, + ) + + if use_f0: + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + else: + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels) + + def remove_weight_norm(self): + """Removes weight normalization from the model.""" + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + pitchf: Optional[torch.Tensor] = None, + y: torch.Tensor = None, + y_lengths: torch.Tensor = None, + ds: Optional[torch.Tensor] = None, + ): + """ + Forward pass of the model. + + Args: + phone (torch.Tensor): Phoneme sequence. + phone_lengths (torch.Tensor): Lengths of the phoneme sequences. + pitch (torch.Tensor, optional): Pitch sequence. 
+ pitchf (torch.Tensor, optional): Fine-grained pitch sequence. + y (torch.Tensor, optional): Target spectrogram. + y_lengths (torch.Tensor, optional): Lengths of the target spectrograms. + ds (torch.Tensor, optional): Speaker embedding. Defaults to None. + """ + g = self.emb_g(ds).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + if y is not None: + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) + if self.use_f0: + pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) + o = self.dec(z_slice, pitchf, g=g) + else: + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + else: + return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + nsff0: Optional[torch.Tensor] = None, + sid: torch.Tensor = None, + rate: Optional[torch.Tensor] = None, + ): + """ + Inference of the model. + + Args: + phone (torch.Tensor): Phoneme sequence. + phone_lengths (torch.Tensor): Lengths of the phoneme sequences. + pitch (torch.Tensor, optional): Pitch sequence. + nsff0 (torch.Tensor, optional): Fine-grained pitch sequence. + sid (torch.Tensor): Speaker embedding. + rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None. + """ + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate is not None: + assert isinstance(rate, torch.Tensor) + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + if self.use_f0: + nsff0 = nsff0[:, head:] + if self.use_f0: + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + else: + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/rvc/lib/predictors/F0Extractor.py b/rvc/lib/predictors/F0Extractor.py new file mode 100644 index 00000000..bc3b61f3 --- /dev/null +++ b/rvc/lib/predictors/F0Extractor.py @@ -0,0 +1,100 @@ +import dataclasses +import pathlib +import libf0 +import librosa +import numpy as np +import resampy +import torch +import torchcrepe +import torchfcpe +import os + +# from tools.anyf0.rmvpe import RMVPE +from rvc.lib.predictors.RMVPE import RMVPE0Predictor +from rvc.configs.config import Config + +config = Config() + + +@dataclasses.dataclass +class F0Extractor: + wav_path: pathlib.Path + sample_rate: int = 44100 + hop_length: int = 512 + f0_min: int = 50 + f0_max: int = 1600 + method: str = "rmvpe" + x: np.ndarray = dataclasses.field(init=False) + + def __post_init__(self): + self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate) + + @property + def hop_size(self) -> float: + return self.hop_length / self.sample_rate + + @property + def wav16k(self) -> np.ndarray: + return resampy.resample(self.x, self.sample_rate, 16000) + + def extract_f0(self) -> np.ndarray: + f0 = None + method = self.method + if method == "crepe": + wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device) + f0 = torchcrepe.predict( + wav16k_torch, + sample_rate=16000, + hop_length=160, + batch_size=512, + fmin=self.f0_min, + fmax=self.f0_max, + device=config.device, + ) + f0 = f0[0].cpu().numpy() 
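# A small standalone note (example numbers only) on the `rate` argument of Synthesizer.infer
# above: it drops the leading (1 - rate) fraction of latent frames so that only the tail of the
# segment is decoded.
import torch

z_p = torch.randn(1, 192, 100)                   # 100 latent frames
rate = torch.tensor(0.25)
head = int(z_p.shape[2] * (1.0 - rate.item()))   # -> 75
print(z_p[:, :, head:].shape)                    # torch.Size([1, 192, 25]): last 25% is kept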
+ elif method == "fcpe": + audio = librosa.to_mono(self.x) + audio_length = len(audio) + f0_target_length = (audio_length // self.hop_length) + 1 + audio = ( + torch.from_numpy(audio) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .to(config.device) + ) + model = torchfcpe.spawn_bundled_infer_model(device=config.device) + + f0 = model.infer( + audio, + sr=self.sample_rate, + decoder_mode="local_argmax", + threshold=0.006, + f0_min=self.f0_min, + f0_max=self.f0_max, + interp_uv=False, + output_interp_target_length=f0_target_length, + ) + f0 = f0.squeeze().cpu().numpy() + elif method == "rmvpe": + model_rmvpe = RMVPE0Predictor( + os.path.join("rvc", "models", "predictors", "rmvpe.pt"), + is_half=config.is_half, + device=config.device, + # hop_length=80 + ) + f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) + + else: + raise ValueError(f"Unknown method: {self.method}") + return libf0.hz_to_cents(f0, librosa.midi_to_hz(0)) + + def plot_f0(self, f0): + from matplotlib import pyplot as plt + + plt.figure(figsize=(10, 4)) + plt.plot(f0) + plt.title(self.method) + plt.xlabel("Time (frames)") + plt.ylabel("F0 (cents)") + plt.show() diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py new file mode 100644 index 00000000..12f6c346 --- /dev/null +++ b/rvc/lib/predictors/FCPE.py @@ -0,0 +1,920 @@ +from typing import Union + +import torch.nn.functional as F +import numpy as np +import torch +import torch.nn as nn +from torch.nn.utils.parametrizations import weight_norm +from torchaudio.transforms import Resample +import os +import librosa +import soundfile as sf +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +import math +from functools import partial + +from einops import rearrange, repeat +from local_attention import LocalAttention +from torch import nn + +os.environ["LRU_CACHE_CAPACITY"] = "3" + + +def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): + """Loads wav file to torch tensor.""" + try: + data, sample_rate = sf.read(full_path, always_2d=True) + except Exception as error: + print(f"An error occurred loading {full_path}: {error}") + if return_empty_on_exception: + return [], sample_rate or target_sr or 48000 + else: + raise + + data = data[:, 0] if len(data.shape) > 1 else data + assert len(data) > 2 + + # Normalize data + max_mag = ( + -np.iinfo(data.dtype).min + if np.issubdtype(data.dtype, np.integer) + else max(np.amax(data), -np.amin(data)) + ) + max_mag = ( + (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0) + ) + data = torch.FloatTensor(data.astype(np.float32)) / max_mag + + # Handle exceptions and resample + if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception: + return [], sample_rate or target_sr or 48000 + if target_sr is not None and sample_rate != target_sr: + data = torch.from_numpy( + librosa.core.resample( + data.numpy(), orig_sr=sample_rate, target_sr=target_sr + ) + ) + sample_rate = target_sr + + return data, sample_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +class STFT: + def __init__( + self, + sr=22050, + n_mels=80, + n_fft=1024, + win_size=1024, + hop_length=256, + fmin=20, + fmax=11025, + 
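# End-to-end usage sketch for the F0Extractor defined above. The wav path is a hypothetical
# example, and the "rmvpe" method assumes rvc/models/predictors/rmvpe.pt has already been
# downloaded.
from rvc.lib.predictors.F0Extractor import F0Extractor

extractor = F0Extractor("example.wav", method="rmvpe")   # hypothetical input file
f0_cents = extractor.extract_f0()                        # F0 contour in cents
print(f0_cents.shape)
extractor.plot_f0(f0_cents)                              # optional matplotlib plot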
clip_val=1e-5, + ): + self.target_sr = sr + self.n_mels = n_mels + self.n_fft = n_fft + self.win_size = win_size + self.hop_length = hop_length + self.fmin = fmin + self.fmax = fmax + self.clip_val = clip_val + self.mel_basis = {} + self.hann_window = {} + + def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): + sample_rate = self.target_sr + n_mels = self.n_mels + n_fft = self.n_fft + win_size = self.win_size + hop_length = self.hop_length + fmin = self.fmin + fmax = self.fmax + clip_val = self.clip_val + + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(n_fft * factor)) + win_size_new = int(np.round(win_size * factor)) + hop_length_new = int(np.round(hop_length * speed)) + + # Optimize mel_basis and hann_window caching + mel_basis = self.mel_basis if not train else {} + hann_window = self.hann_window if not train else {} + + mel_basis_key = str(fmax) + "_" + str(y.device) + if mel_basis_key not in mel_basis: + mel = librosa_mel_fn( + sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax + ) + mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) + + keyshift_key = str(keyshift) + "_" + str(y.device) + if keyshift_key not in hann_window: + hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) + + # Padding and STFT + pad_left = (win_size_new - hop_length_new) // 2 + pad_right = max( + (win_size_new - hop_length_new + 1) // 2, + win_size_new - y.size(-1) - pad_left, + ) + mode = "reflect" if pad_right < y.size(-1) else "constant" + y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft_new, + hop_length=hop_length_new, + win_length=win_size_new, + window=hann_window[keyshift_key], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) + + # Handle keyshift and mel conversion + if keyshift != 0: + size = n_fft // 2 + 1 + resize = spec.size(1) + spec = ( + F.pad(spec, (0, 0, 0, size - resize)) + if resize < size + else spec[:, :size, :] + ) + spec = spec * win_size / win_size_new + spec = torch.matmul(mel_basis[mel_basis_key], spec) + spec = dynamic_range_compression_torch(spec, clip_val=clip_val) + return spec + + def __call__(self, audiopath): + audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) + spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) + return spect + + +stft = STFT() + + +def softmax_kernel( + data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None +): + b, h, *_ = data.shape + + # Normalize data + data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 + + # Project data + ratio = projection_matrix.shape[0] ** -0.5 + projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) + projection = projection.type_as(data) + data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) + + # Calculate diagonal data + diag_data = data**2 + diag_data = torch.sum(diag_data, dim=-1) + diag_data = (diag_data / 2.0) * (data_normalizer**2) + diag_data = diag_data.unsqueeze(dim=-1) + + # Apply softmax + if is_query: + data_dash = ratio * ( + torch.exp( + data_dash + - diag_data + - torch.max(data_dash, dim=-1, keepdim=True).values + ) + + eps + ) + else: + data_dash = ratio * (torch.exp(data_dash - diag_data + eps)) + + return data_dash.type_as(data) + + +def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): + unstructured_block = 
torch.randn((cols, cols), device=device) + q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") + q, r = map(lambda t: t.to(device), (q, r)) + + if qr_uniform_q: + d = torch.diag(r, 0) + q *= d.sign() + return q.t() + + +def exists(val): + return val is not None + + +def empty(tensor): + return tensor.numel() == 0 + + +def default(val, d): + return val if exists(val) else d + + +def cast_tuple(val): + return (val,) if not isinstance(val, tuple) else val + + +class PCmer(nn.Module): + def __init__( + self, + num_layers, + num_heads, + dim_model, + dim_keys, + dim_values, + residual_dropout, + attention_dropout, + ): + super().__init__() + self.num_layers = num_layers + self.num_heads = num_heads + self.dim_model = dim_model + self.dim_values = dim_values + self.dim_keys = dim_keys + self.residual_dropout = residual_dropout + self.attention_dropout = attention_dropout + + self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) + + def forward(self, phone, mask=None): + for layer in self._layers: + phone = layer(phone, mask) + return phone + + +class _EncoderLayer(nn.Module): + def __init__(self, parent: PCmer): + super().__init__() + self.conformer = ConformerConvModule(parent.dim_model) + self.norm = nn.LayerNorm(parent.dim_model) + self.dropout = nn.Dropout(parent.residual_dropout) + self.attn = SelfAttention( + dim=parent.dim_model, heads=parent.num_heads, causal=False + ) + + def forward(self, phone, mask=None): + phone = phone + (self.attn(self.norm(phone), mask=mask)) + phone = phone + (self.conformer(phone)) + return phone + + +def calc_same_padding(kernel_size): + pad = kernel_size // 2 + return (pad, pad - (kernel_size + 1) % 2) + + +class Swish(nn.Module): + def forward(self, x): + return x * x.sigmoid() + + +class Transpose(nn.Module): + def __init__(self, dims): + super().__init__() + assert len(dims) == 2, "dims must be a tuple of two dimensions" + self.dims = dims + + def forward(self, x): + return x.transpose(*self.dims) + + +class GLU(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + out, gate = x.chunk(2, dim=self.dim) + return out * gate.sigmoid() + + +class DepthWiseConv1d(nn.Module): + def __init__(self, chan_in, chan_out, kernel_size, padding): + super().__init__() + self.padding = padding + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in) + + def forward(self, x): + x = F.pad(x, self.padding) + return self.conv(x) + + +class ConformerConvModule(nn.Module): + def __init__( + self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0 + ): + super().__init__() + + inner_dim = dim * expansion_factor + padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + + self.net = nn.Sequential( + nn.LayerNorm(dim), + Transpose((1, 2)), + nn.Conv1d(dim, inner_dim * 2, 1), + GLU(dim=1), + DepthWiseConv1d( + inner_dim, inner_dim, kernel_size=kernel_size, padding=padding + ), + Swish(), + nn.Conv1d(inner_dim, dim, 1), + Transpose((1, 2)), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +def linear_attention(q, k, v): + if v is None: + out = torch.einsum("...ed,...nd->...ne", k, q) + return out + else: + k_cumsum = k.sum(dim=-2) + D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8) + context = torch.einsum("...nd,...ne->...de", k, v) + out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv) + return out + + +def gaussian_orthogonal_random_matrix( + nb_rows, nb_columns, 
scaling=0, qr_uniform_q=False, device=None +): + nb_full_blocks = int(nb_rows / nb_columns) + block_list = [] + + for _ in range(nb_full_blocks): + q = orthogonal_matrix_chunk( + nb_columns, qr_uniform_q=qr_uniform_q, device=device + ) + block_list.append(q) + + remaining_rows = nb_rows - nb_full_blocks * nb_columns + if remaining_rows > 0: + q = orthogonal_matrix_chunk( + nb_columns, qr_uniform_q=qr_uniform_q, device=device + ) + block_list.append(q[:remaining_rows]) + + final_matrix = torch.cat(block_list) + + if scaling == 0: + multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) + elif scaling == 1: + multiplier = math.sqrt((float(nb_columns))) * torch.ones( + (nb_rows,), device=device + ) + else: + raise ValueError(f"Invalid scaling {scaling}") + + return torch.diag(multiplier) @ final_matrix + + +class FastAttention(nn.Module): + def __init__( + self, + dim_heads, + nb_features=None, + ortho_scaling=0, + causal=False, + generalized_attention=False, + kernel_fn=nn.ReLU(), + qr_uniform_q=False, + no_projection=False, + ): + super().__init__() + nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) + + self.dim_heads = dim_heads + self.nb_features = nb_features + self.ortho_scaling = ortho_scaling + + self.create_projection = partial( + gaussian_orthogonal_random_matrix, + nb_rows=self.nb_features, + nb_columns=dim_heads, + scaling=ortho_scaling, + qr_uniform_q=qr_uniform_q, + ) + projection_matrix = self.create_projection() + self.register_buffer("projection_matrix", projection_matrix) + + self.generalized_attention = generalized_attention + self.kernel_fn = kernel_fn + self.no_projection = no_projection + self.causal = causal + + @torch.no_grad() + def redraw_projection_matrix(self): + projections = self.create_projection() + self.projection_matrix.copy_(projections) + del projections + + def forward(self, q, k, v): + device = q.device + + if self.no_projection: + q = q.softmax(dim=-1) + k = torch.exp(k) if self.causal else k.softmax(dim=-2) + else: + create_kernel = partial( + softmax_kernel, projection_matrix=self.projection_matrix, device=device + ) + q = create_kernel(q, is_query=True) + k = create_kernel(k, is_query=False) + + attn_fn = linear_attention if not self.causal else self.causal_linear_fn + + if v is None: + out = attn_fn(q, k, None) + return out + else: + out = attn_fn(q, k, v) + return out + + +class SelfAttention(nn.Module): + def __init__( + self, + dim, + causal=False, + heads=8, + dim_head=64, + local_heads=0, + local_window_size=256, + nb_features=None, + feature_redraw_interval=1000, + generalized_attention=False, + kernel_fn=nn.ReLU(), + qr_uniform_q=False, + dropout=0.0, + no_projection=False, + ): + super().__init__() + assert dim % heads == 0, "dimension must be divisible by number of heads" + dim_head = default(dim_head, dim // heads) + inner_dim = dim_head * heads + self.fast_attention = FastAttention( + dim_head, + nb_features, + causal=causal, + generalized_attention=generalized_attention, + kernel_fn=kernel_fn, + qr_uniform_q=qr_uniform_q, + no_projection=no_projection, + ) + + self.heads = heads + self.global_heads = heads - local_heads + self.local_attn = ( + LocalAttention( + window_size=local_window_size, + causal=causal, + autopad=True, + dropout=dropout, + look_forward=int(not causal), + rel_pos_emb_config=(dim_head, local_heads), + ) + if local_heads > 0 + else None + ) + + self.to_q = nn.Linear(dim, inner_dim) + self.to_k = nn.Linear(dim, inner_dim) + self.to_v = nn.Linear(dim, inner_dim) + self.to_out = 
nn.Linear(inner_dim, dim) + self.dropout = nn.Dropout(dropout) + + @torch.no_grad() + def redraw_projection_matrix(self): + self.fast_attention.redraw_projection_matrix() + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + name=None, + inference=False, + **kwargs, + ): + _, _, _, h, gh = *x.shape, self.heads, self.global_heads + + cross_attend = exists(context) + context = default(context, x) + context_mask = default(context_mask, mask) if not cross_attend else context_mask + q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) + + attn_outs = [] + if not empty(q): + if exists(context_mask): + global_mask = context_mask[:, None, :, None] + v.masked_fill_(~global_mask, 0.0) + if cross_attend: + pass # TODO: Implement cross-attention + else: + out = self.fast_attention(q, k, v) + attn_outs.append(out) + + if not empty(lq): + assert ( + not cross_attend + ), "local attention is not compatible with cross attention" + out = self.local_attn(lq, lk, lv, input_mask=mask) + attn_outs.append(out) + + out = torch.cat(attn_outs, dim=1) + out = rearrange(out, "b h n d -> b n (h d)") + out = self.to_out(out) + return self.dropout(out) + + +def l2_regularization(model, l2_alpha): + l2_loss = [] + for module in model.modules(): + if type(module) is nn.Conv2d: + l2_loss.append((module.weight**2).sum() / 2.0) + return l2_alpha * sum(l2_loss) + + +class FCPE(nn.Module): + def __init__( + self, + input_channel=128, + out_dims=360, + n_layers=12, + n_chans=512, + use_siren=False, + use_full=False, + loss_mse_scale=10, + loss_l2_regularization=False, + loss_l2_regularization_scale=1, + loss_grad1_mse=False, + loss_grad1_mse_scale=1, + f0_max=1975.5, + f0_min=32.70, + confidence=False, + threshold=0.05, + use_input_conv=True, + ): + super().__init__() + if use_siren is True: + raise ValueError("Siren is not supported yet.") + if use_full is True: + raise ValueError("Full model is not supported yet.") + + self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 + self.loss_l2_regularization = ( + loss_l2_regularization if (loss_l2_regularization is not None) else False + ) + self.loss_l2_regularization_scale = ( + loss_l2_regularization_scale + if (loss_l2_regularization_scale is not None) + else 1 + ) + self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False + self.loss_grad1_mse_scale = ( + loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 + ) + self.f0_max = f0_max if (f0_max is not None) else 1975.5 + self.f0_min = f0_min if (f0_min is not None) else 32.70 + self.confidence = confidence if (confidence is not None) else False + self.threshold = threshold if (threshold is not None) else 0.05 + self.use_input_conv = use_input_conv if (use_input_conv is not None) else True + + self.cent_table_b = torch.Tensor( + np.linspace( + self.f0_to_cent(torch.Tensor([f0_min]))[0], + self.f0_to_cent(torch.Tensor([f0_max]))[0], + out_dims, + ) + ) + self.register_buffer("cent_table", self.cent_table_b) + + # conv in stack + _leaky = nn.LeakyReLU() + self.stack = nn.Sequential( + nn.Conv1d(input_channel, n_chans, 3, 1, 1), + nn.GroupNorm(4, n_chans), + _leaky, + nn.Conv1d(n_chans, n_chans, 3, 1, 1), + ) + + # transformer + self.decoder = PCmer( + num_layers=n_layers, + num_heads=8, + dim_model=n_chans, + dim_keys=n_chans, + dim_values=n_chans, + residual_dropout=0.1, + 
attention_dropout=0.1, + ) + self.norm = nn.LayerNorm(n_chans) + + # out + self.n_out = out_dims + self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) + + def forward( + self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax" + ): + if cdecoder == "argmax": + self.cdecoder = self.cents_decoder + elif cdecoder == "local_argmax": + self.cdecoder = self.cents_local_decoder + + x = ( + self.stack(mel.transpose(1, 2)).transpose(1, 2) + if self.use_input_conv + else mel + ) + x = self.decoder(x) + x = self.norm(x) + x = self.dense_out(x) + x = torch.sigmoid(x) + + if not infer: + gt_cent_f0 = self.f0_to_cent(gt_f0) + gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) + loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) + if self.loss_l2_regularization: + loss_all = loss_all + l2_regularization( + model=self, l2_alpha=self.loss_l2_regularization_scale + ) + x = loss_all + if infer: + x = self.cdecoder(x) + x = self.cent_to_f0(x) + x = (1 + x / 700).log() if not return_hz_f0 else x + + return x + + def cents_decoder(self, y, mask=True): + B, N, _ = y.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum( + y, dim=-1, keepdim=True + ) + if mask: + confident = torch.max(y, dim=-1, keepdim=True)[0] + confident_mask = torch.ones_like(confident) + confident_mask[confident <= self.threshold] = float("-INF") + rtn = rtn * confident_mask + return (rtn, confident) if self.confidence else rtn + + def cents_local_decoder(self, y, mask=True): + B, N, _ = y.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + confident, max_index = torch.max(y, dim=-1, keepdim=True) + local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4) + local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1) + ci_l = torch.gather(ci, -1, local_argmax_index) + y_l = torch.gather(y, -1, local_argmax_index) + rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum( + y_l, dim=-1, keepdim=True + ) + if mask: + confident_mask = torch.ones_like(confident) + confident_mask[confident <= self.threshold] = float("-INF") + rtn = rtn * confident_mask + return (rtn, confident) if self.confidence else rtn + + def cent_to_f0(self, cent): + return 10.0 * 2 ** (cent / 1200.0) + + def f0_to_cent(self, f0): + return 1200.0 * torch.log2(f0 / 10.0) + + def gaussian_blurred_cent(self, cents): + mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0))) + B, N, _ = cents.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() + + +class FCPEInfer: + def __init__(self, model_path, device=None, dtype=torch.float32): + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + ckpt = torch.load(model_path, map_location=torch.device(self.device)) + self.args = DotDict(ckpt["config"]) + self.dtype = dtype + model = FCPE( + input_channel=self.args.model.input_channel, + out_dims=self.args.model.out_dims, + n_layers=self.args.model.n_layers, + n_chans=self.args.model.n_chans, + use_siren=self.args.model.use_siren, + use_full=self.args.model.use_full, + loss_mse_scale=self.args.loss.loss_mse_scale, + loss_l2_regularization=self.args.loss.loss_l2_regularization, + loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, + loss_grad1_mse=self.args.loss.loss_grad1_mse, + loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, + 
f0_max=self.args.model.f0_max, + f0_min=self.args.model.f0_min, + confidence=self.args.model.confidence, + ) + model.to(self.device).to(self.dtype) + model.load_state_dict(ckpt["model"]) + model.eval() + self.model = model + self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) + + @torch.no_grad() + def __call__(self, audio, sr, threshold=0.05): + self.model.threshold = threshold + audio = audio[None, :] + mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) + f0 = self.model(mel=mel, infer=True, return_hz_f0=True) + return f0 + + +class Wav2Mel: + def __init__(self, args, device=None, dtype=torch.float32): + self.sample_rate = args.mel.sampling_rate + self.hop_size = args.mel.hop_size + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.dtype = dtype + self.stft = STFT( + args.mel.sampling_rate, + args.mel.num_mels, + args.mel.n_fft, + args.mel.win_size, + args.mel.hop_size, + args.mel.fmin, + args.mel.fmax, + ) + self.resample_kernel = {} + + def extract_nvstft(self, audio, keyshift=0, train=False): + mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) + return mel + + def extract_mel(self, audio, sample_rate, keyshift=0, train=False): + audio = audio.to(self.dtype).to(self.device) + if sample_rate == self.sample_rate: + audio_res = audio + else: + key_str = str(sample_rate) + if key_str not in self.resample_kernel: + self.resample_kernel[key_str] = Resample( + sample_rate, self.sample_rate, lowpass_filter_width=128 + ) + self.resample_kernel[key_str] = ( + self.resample_kernel[key_str].to(self.dtype).to(self.device) + ) + audio_res = self.resample_kernel[key_str](audio) + + mel = self.extract_nvstft( + audio_res, keyshift=keyshift, train=train + ) # B, n_frames, bins + n_frames = int(audio.shape[1] // self.hop_size) + 1 + mel = ( + torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel + ) + mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel + return mel + + def __call__(self, audio, sample_rate, keyshift=0, train=False): + return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) + + +class DotDict(dict): + def __getattr__(*args): + val = dict.get(*args) + return DotDict(val) if type(val) is dict else val + + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + +class F0Predictor(object): + def compute_f0(self, wav, p_len): + pass + + def compute_f0_uv(self, wav, p_len): + pass + + +class FCPEF0Predictor(F0Predictor): + def __init__( + self, + model_path, + hop_length=512, + f0_min=50, + f0_max=1100, + dtype=torch.float32, + device=None, + sample_rate=44100, + threshold=0.05, + ): + self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.threshold = threshold + self.sample_rate = sample_rate + self.dtype = dtype + self.name = "fcpe" + + def repeat_expand( + self, + content: Union[torch.Tensor, np.ndarray], + target_len: int, + mode: str = "nearest", + ): + ndim = content.ndim + content = ( + content[None, None] + if ndim == 1 + else content[None] if ndim == 2 else content + ) + assert content.ndim == 3 + is_np = isinstance(content, np.ndarray) + content = torch.from_numpy(content) if is_np else content + results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) + results = results.numpy() if is_np else results + return 
results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results + + def post_process(self, x, sample_rate, f0, pad_to): + f0 = ( + torch.from_numpy(f0).float().to(x.device) + if isinstance(f0, np.ndarray) + else f0 + ) + f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 + + vuv_vector = torch.zeros_like(f0) + vuv_vector[f0 > 0.0] = 1.0 + vuv_vector[f0 <= 0.0] = 0.0 + + nzindex = torch.nonzero(f0).squeeze() + f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate + + vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] + + if f0.shape[0] <= 0: + return np.zeros(pad_to), vuv_vector.cpu().numpy() + if f0.shape[0] == 1: + return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() + + f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) + return f0, vuv_vector.cpu().numpy() + + def compute_f0(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else np.zeros(p_len) + ) + return self.post_process(x, self.sample_rate, f0, p_len)[0] + + def compute_f0_uv(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else np.zeros(p_len) + ) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/predictors/RMVPE.py b/rvc/lib/predictors/RMVPE.py new file mode 100644 index 00000000..970c5e58 --- /dev/null +++ b/rvc/lib/predictors/RMVPE.py @@ -0,0 +1,560 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from librosa.filters import mel +from typing import List + +# Constants for readability +N_MELS = 128 +N_CLASS = 360 + + +# Define a helper function for creating convolutional blocks +class ConvBlockRes(nn.Module): + """ + A convolutional block with residual connection. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + momentum (float): Momentum for batch normalization. 
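+
+    Note:
+        When in_channels != out_channels, a 1x1 convolution on the shortcut
+        path matches the channel count before the residual addition.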
+ """ + + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +# Define a class for residual encoder blocks +class ResEncoderBlock(nn.Module): + """ + A residual encoder block. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in the block. + momentum (float): Momentum for batch normalization. + """ + + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +# Define a class for the encoder +class Encoder(nn.Module): + """ + The encoder part of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + in_size (int): Size of the input tensor. + n_encoders (int): Number of encoder blocks. + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder block. + out_channels (int): Number of output channels for the first encoder block. + momentum (float): Momentum for batch normalization. + """ + + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x: torch.Tensor): + concat_tensors: List[torch.Tensor] = [] + x = self.bn(x) + for i in range(self.n_encoders): + t, x = self.layers[i](x) + concat_tensors.append(t) + return x, concat_tensors + + +# Define a class for the intermediate layer +class Intermediate(nn.Module): + """ + The intermediate layer of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. 
+ n_inters (int): Number of convolutional blocks in the intermediate layer. + n_blocks (int): Number of convolutional blocks in each intermediate block. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for _ in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +# Define a class for residual decoder blocks +class ResDecoderBlock(nn.Module): + """ + A residual decoder block. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (tuple): Stride for transposed convolution. + n_blocks (int): Number of convolutional blocks in the block. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +# Define a class for the decoder +class Decoder(nn.Module): + """ + The decoder part of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + n_decoders (int): Number of decoder blocks. + stride (tuple): Stride for transposed convolution. + n_blocks (int): Number of convolutional blocks in each decoder block. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for _ in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +# Define a class for the DeepUnet architecture +class DeepUnet(nn.Module): + """ + The DeepUnet architecture. + + Args: + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. 
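+
+    Note:
+        The encoder's intermediate feature maps are kept as skip connections
+        and consumed by the decoder in reverse order.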
+ """ + + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +# Define a class for the end-to-end model +class E2E(nn.Module): + """ + The end-to-end model. + + Args: + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + n_gru (int): Number of GRU layers. + kernel_size (tuple): Size of the average pooling kernel. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. + """ + + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, N_CLASS), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +# Define a class for the MelSpectrogram extractor +class MelSpectrogram(torch.nn.Module): + """ + Extracts Mel-spectrogram features from audio. + + Args: + is_half (bool): Whether to use half-precision floating-point numbers. + n_mel_channels (int): Number of Mel-frequency bands. + sample_rate (int): Sampling rate of the audio. + win_length (int): Length of the window function in samples. + hop_length (int): Hop size between frames in samples. + n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length. + mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0. + mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None. + clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5. 
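+
+    Note:
+        keyshift rescales the FFT and window sizes while speed rescales the
+        hop length, so pitch and tempo shifts are applied directly in the STFT.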
+ """ + + def __init__( + self, + is_half, + n_mel_channels, + sample_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sample_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sample_rate = sample_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +# Define a class for the RMVPE0 predictor +class RMVPE0Predictor: + """ + A predictor for fundamental frequency (F0) based on the RMVPE0 model. + + Args: + model_path (str): Path to the RMVPE0 model file. + is_half (bool): Whether to use half-precision floating-point numbers. + device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available. + """ + + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = is_half + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, N_MELS, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) + + def mel2hidden(self, mel): + """ + Converts Mel-spectrogram features to hidden representation. + + Args: + mel (torch.Tensor): Mel-spectrogram features. + """ + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + """ + Decodes hidden representation to F0. + + Args: + hidden (np.ndarray): Hidden representation. + thred (float, optional): Threshold for salience. Defaults to 0.03. 
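+
+        Returns f0 in Hz; frames whose salience never exceeds the threshold
+        decode to 0.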
+ """ + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + return f0 + + def infer_from_audio(self, audio, thred=0.03): + """ + Infers F0 from audio. + + Args: + audio (np.ndarray): Audio signal. + thred (float, optional): Threshold for salience. Defaults to 0.03. + """ + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + mel = self.mel_extractor(audio, center=True) + hidden = self.mel2hidden(mel) + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + """ + Converts salience to local average cents. + + Args: + salience (np.ndarray): Salience values. + thred (float, optional): Threshold for salience. Defaults to 0.05. + """ + center = np.argmax(salience, axis=1) + salience = np.pad(salience, ((0, 0), (4, 4))) + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + todo_salience = np.array(todo_salience) + todo_cents_mapping = np.array(todo_cents_mapping) + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) + devided = product_sum / weight_sum + maxx = np.max(salience, axis=1) + devided[maxx <= thred] = 0 + return devided + + +# Define a class for BiGRU (bidirectional GRU) +class BiGRU(nn.Module): + """ + A bidirectional GRU layer. + + Args: + input_features (int): Number of input features. + hidden_features (int): Number of hidden features. + num_layers (int): Number of GRU layers. 
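+
+    Note:
+        Because the GRU is bidirectional, the output feature size is
+        2 * hidden_features (e.g. 512 for the 256-unit GRU used in E2E).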
+ """ + + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] diff --git a/rvc/lib/tools/analyzer.py b/rvc/lib/tools/analyzer.py new file mode 100644 index 00000000..f4b79434 --- /dev/null +++ b/rvc/lib/tools/analyzer.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +import librosa.display +import librosa + + +def calculate_features(y, sr): + stft = np.abs(librosa.stft(y)) + duration = librosa.get_duration(y=y, sr=sr) + cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0] + bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0] + rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0] + return stft, duration, cent, bw, rolloff + + +def plot_title(title): + plt.suptitle(title, fontsize=16, fontweight="bold") + + +def plot_spectrogram(y, sr, stft, duration, cmap="inferno"): + plt.subplot(3, 1, 1) + plt.imshow( + librosa.amplitude_to_db(stft, ref=np.max), + origin="lower", + extent=[0, duration, 0, sr / 1000], + aspect="auto", + cmap=cmap, # Change the colormap here + ) + plt.colorbar(format="%+2.0f dB") + plt.xlabel("Time (s)") + plt.ylabel("Frequency (kHz)") + plt.title("Spectrogram") + + +def plot_waveform(y, sr, duration): + plt.subplot(3, 1, 2) + librosa.display.waveshow(y, sr=sr) + plt.xlabel("Time (s)") + plt.ylabel("Amplitude") + plt.title("Waveform") + + +def plot_features(times, cent, bw, rolloff, duration): + plt.subplot(3, 1, 3) + plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b") + plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g") + plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r") + plt.xlabel("Time (s)") + plt.title("Spectral Features") + plt.legend() + + +def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"): + y, sr = librosa.load(audio_file) + stft, duration, cent, bw, rolloff = calculate_features(y, sr) + + plt.figure(figsize=(12, 10)) + + plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1]) + plot_spectrogram(y, sr, stft, duration) + plot_waveform(y, sr, duration) + plot_features(librosa.times_like(cent), cent, bw, rolloff, duration) + + plt.tight_layout() + + if save_plot_path: + plt.savefig(save_plot_path, bbox_inches="tight", dpi=300) + plt.close() + + audio_info = f"""Sample Rate: {sr}\nDuration: {( + str(round(duration, 2)) + " seconds" + if duration < 60 + else str(round(duration / 60, 2)) + " minutes" + )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}""" + + return audio_info, save_plot_path diff --git a/rvc/lib/tools/gdown.py b/rvc/lib/tools/gdown.py new file mode 100644 index 00000000..eb5ca071 --- /dev/null +++ b/rvc/lib/tools/gdown.py @@ -0,0 +1,354 @@ +import os +import re +import six +import sys +import json +import tqdm +import time +import shutil +import warnings +import tempfile +import textwrap +import requests +from six.moves import urllib_parse + + +def indent(text, prefix): + """Indent each non-empty line of text with the given prefix.""" + return "".join( + (prefix + line if line.strip() else line) for line in text.splitlines(True) + ) + + +class FileURLRetrievalError(Exception): + pass + + +class FolderContentsMaximumLimitError(Exception): + pass + + +def parse_url(url, warning=True): + """Parse URLs especially for Google 
Drive links.
+
+    Args:
+        url: URL to parse.
+        warning: Whether to warn if the URL is not a download link.
+
+    Returns:
+        A tuple (file_id, is_download_link), where file_id is the ID of the
+        file on Google Drive, and is_download_link is a flag indicating
+        whether the URL is a download link.
+    """
+    parsed = urllib_parse.urlparse(url)
+    query = urllib_parse.parse_qs(parsed.query)
+    is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com")
+    is_download_link = parsed.path.endswith("/uc")
+
+    if not is_gdrive:
+        return None, is_download_link
+
+    file_id = query.get("id", [None])[0]
+    if file_id is None:
+        for pattern in (
+            r"^/file/d/(.*?)/(edit|view)$",
+            r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$",
+            r"^/document/d/(.*?)/(edit|htmlview|view)$",
+            r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
+            r"^/presentation/d/(.*?)/(edit|htmlview|view)$",
+            r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
+            r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$",
+            r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
+        ):
+            match = re.match(pattern, parsed.path)
+            if match:
+                file_id = match.group(1)
+                break
+
+    if warning and not is_download_link:
+        warnings.warn(
+            "You specified a Google Drive link that is not the correct link "
+            "to download a file. You might want to try `--fuzzy` option "
+            f"or the following url: https://drive.google.com/uc?id={file_id}"
+        )
+
+    return file_id, is_download_link
+
+
+CHUNK_SIZE = 512 * 1024  # 512KB
+HOME = os.path.expanduser("~")
+
+
+def get_url_from_gdrive_confirmation(contents):
+    """Extract the download URL from a Google Drive confirmation page."""
+    for pattern in (
+        r'href="(\/uc\?export=download[^"]+)',
+        r'href="/open\?id=([^"]+)"',
+        r'"downloadUrl":"([^"]+)',
+    ):
+        match = re.search(pattern, contents)
+        if match:
+            url = match.group(1)
+            if pattern == r'href="/open\?id=([^"]+)"':
+                # Assumes the confirmation page exposes the token as a hidden
+                # <input name="uuid"> form field.
+                uuid = re.search(
+                    r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
+                    contents,
+                )
+                if uuid:
+                    return (
+                        "https://drive.usercontent.google.com/download?id="
+                        + url
+                        + "&confirm=t&uuid="
+                        + uuid.group(1)
+                    )
+                break
+            if pattern == r'"downloadUrl":"([^"]+)':
+                return url.replace("\\u003d", "=").replace("\\u0026", "&")
+            return "https://docs.google.com" + url.replace("&amp;", "&")
+
+    match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
+    if match:
+        error = match.group(1)
+        raise FileURLRetrievalError(error)
+
+    raise FileURLRetrievalError(
+        "Cannot retrieve the public link of the file. "
+        "You may need to change the permission to "
+        "'Anyone with the link', or have had many accesses."
+    )
+
+
+def _get_session(proxy, use_cookies, return_cookies_file=False):
+    """Create a requests session with optional proxy and cookie handling."""
+    sess = requests.session()
+    sess.headers.update(
+        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
+    )
+
+    if proxy is not None:
+        sess.proxies = {"http": proxy, "https": proxy}
+        print("Using proxy:", proxy, file=sys.stderr)
+
+    cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
+    if os.path.exists(cookies_file) and use_cookies:
+        with open(cookies_file) as f:
+            cookies = json.load(f)
+        for k, v in cookies:
+            sess.cookies[k] = v
+
+    return (sess, cookies_file) if return_cookies_file else sess
+
+
+def download(
+    url=None,
+    output=None,
+    quiet=False,
+    proxy=None,
+    speed=None,
+    use_cookies=True,
+    verify=True,
+    id=None,
+    fuzzy=True,
+    resume=False,
+    format=None,
+):
+    """Download file from URL.
+
+    Parameters
+    ----------
+    url: str
+        URL. Google Drive URL is also supported.
+    output: str
+        Output filename. Default is basename of URL.
+    quiet: bool
+        Suppress terminal output. Default is False.
+    proxy: str
+        Proxy.
+    speed: float
+        Download byte size per second (e.g., 256KB/s = 256 * 1024).
+    use_cookies: bool
+        Flag to use cookies. Default is True.
+    verify: bool or string
+        Either a bool, in which case it controls whether the server's TLS
+        certificate is verified, or a string, in which case it must be a path
+        to a CA bundle to use. Default is True.
+    id: str
+        Google Drive's file ID.
+    fuzzy: bool
+        Fuzzy extraction of Google Drive's file Id. Default is True.
+    resume: bool
+        Resume the download from existing tmp file if possible.
+        Default is False.
+    format: str, optional
+        Format of Google Docs, Spreadsheets and Slides. Default is:
+        - Google Docs: 'docx'
+        - Google Spreadsheet: 'xlsx'
+        - Google Slides: 'pptx'
+
+    Returns
+    -------
+    output: str
+        Output filename.
+    """
+    if not (id is None) ^ (url is None):
+        raise ValueError("Either url or id has to be specified")
+    if id is not None:
+        url = f"https://drive.google.com/uc?id={id}"
+
+    url_origin = url
+
+    sess, cookies_file = _get_session(
+        proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
+    )
+
+    gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy)
+
+    if fuzzy and gdrive_file_id:
+        # overwrite the url with fuzzy match of a file id
+        url = f"https://drive.google.com/uc?id={gdrive_file_id}"
+        url_origin = url
+        is_gdrive_download_link = True
+
+    while True:
+        res = sess.get(url, stream=True, verify=verify)
+
+        if url == url_origin and res.status_code == 500:
+            # The file could be Google Docs or Spreadsheets.
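+            # Retry through the /open endpoint so the returned HTML title
+            # reveals which export format (docx/xlsx/pptx) to request.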
+ url = f"https://drive.google.com/open?id={gdrive_file_id}" + continue + + if res.headers["Content-Type"].startswith("text/html"): + title = re.search("(.+)", res.text) + if title: + title = title.group(1) + if title.endswith(" - Google Docs"): + url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}" + continue + if title.endswith(" - Google Sheets"): + url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}" + continue + if title.endswith(" - Google Slides"): + url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}" + continue + elif ( + "Content-Disposition" in res.headers + and res.headers["Content-Disposition"].endswith("pptx") + and format not in (None, "pptx") + ): + url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}" + continue + + if use_cookies: + os.makedirs(os.path.dirname(cookies_file), exist_ok=True) + with open(cookies_file, "w") as f: + cookies = [ + (k, v) + for k, v in sess.cookies.items() + if not k.startswith("download_warning_") + ] + json.dump(cookies, f, indent=2) + + if "Content-Disposition" in res.headers: + # This is the file + break + if not (gdrive_file_id and is_gdrive_download_link): + break + + # Need to redirect with confirmation + try: + url = get_url_from_gdrive_confirmation(res.text) + except FileURLRetrievalError as e: + message = ( + "Failed to retrieve file url:\n\n" + "{}\n\n" + "You may still be able to access the file from the browser:" + f"\n\n\t{url_origin}\n\n" + "but Gdown can't. Please check connections and permissions." + ).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t")) + raise FileURLRetrievalError(message) + + if gdrive_file_id and is_gdrive_download_link: + content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"]) + filename_from_url = ( + re.search(r"filename\*=UTF-8''(.*)", content_disposition) + or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition) + ).group(1) + filename_from_url = filename_from_url.replace(os.path.sep, "_") + else: + filename_from_url = os.path.basename(url) + + output = output or filename_from_url + + output_is_path = isinstance(output, six.string_types) + if output_is_path and output.endswith(os.path.sep): + os.makedirs(output, exist_ok=True) + output = os.path.join(output, filename_from_url) + + if output_is_path: + temp_dir = os.path.dirname(output) or "." 
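+        # Collect any partially downloaded files that share this prefix so a
+        # previous transfer can be resumed instead of restarted.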
+ prefix = os.path.basename(output) + existing_tmp_files = [ + os.path.join(temp_dir, file) + for file in os.listdir(temp_dir) + if file.startswith(prefix) + ] + if resume and existing_tmp_files: + if len(existing_tmp_files) > 1: + print( + "There are multiple temporary files to resume:", + file=sys.stderr, + ) + for file in existing_tmp_files: + print(f"\t{file}", file=sys.stderr) + print( + "Please remove them except one to resume downloading.", + file=sys.stderr, + ) + return + tmp_file = existing_tmp_files[0] + else: + resume = False + tmp_file = tempfile.mktemp( + suffix=tempfile.template, prefix=prefix, dir=temp_dir + ) + f = open(tmp_file, "ab") + else: + tmp_file = None + f = output + + if tmp_file is not None and f.tell() != 0: + headers = {"Range": f"bytes={f.tell()}-"} + res = sess.get(url, headers=headers, stream=True, verify=verify) + + if not quiet: + if resume: + print("Resume:", tmp_file, file=sys.stderr) + print( + "To:", + os.path.abspath(output) if output_is_path else output, + file=sys.stderr, + ) + + try: + total = int(res.headers.get("Content-Length", 0)) + if not quiet: + pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True) + t_start = time.time() + for chunk in res.iter_content(chunk_size=CHUNK_SIZE): + f.write(chunk) + if not quiet: + pbar.update(len(chunk)) + if speed is not None: + elapsed_time_expected = 1.0 * pbar.n / speed + elapsed_time = time.time() - t_start + if elapsed_time < elapsed_time_expected: + time.sleep(elapsed_time_expected - elapsed_time) + if not quiet: + pbar.close() + if tmp_file: + f.close() + shutil.move(tmp_file, output) + finally: + sess.close() + + return output diff --git a/rvc/lib/tools/launch_tensorboard.py b/rvc/lib/tools/launch_tensorboard.py new file mode 100644 index 00000000..7f74e316 --- /dev/null +++ b/rvc/lib/tools/launch_tensorboard.py @@ -0,0 +1,21 @@ +import time +import logging +from tensorboard import program + +log_path = "logs" + + +def launch_tensorboard_pipeline(): + logging.getLogger("root").setLevel(logging.WARNING) + logging.getLogger("tensorboard").setLevel(logging.WARNING) + + tb = program.TensorBoard() + tb.configure(argv=[None, "--logdir", log_path]) + url = tb.launch() + + print( + f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D" + ) + + while True: + time.sleep(600) diff --git a/rvc/lib/tools/model_download.py b/rvc/lib/tools/model_download.py new file mode 100644 index 00000000..ab1b136e --- /dev/null +++ b/rvc/lib/tools/model_download.py @@ -0,0 +1,385 @@ +import os +import re +import six +import sys +import wget +import shutil +import zipfile +import requests +from bs4 import BeautifulSoup +from urllib.parse import unquote, urlencode, parse_qs, urlparse + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from rvc.lib.utils import format_title +from rvc.lib.tools import gdown + + +def find_folder_parent(search_dir, folder_name): + for dirpath, dirnames, _ in os.walk(search_dir): + if folder_name in dirnames: + return os.path.abspath(dirpath) + return None + + +file_path = find_folder_parent(now_dir, "logs") +zips_path = os.path.join(file_path, "zips") + + +def search_pth_index(folder): + pth_paths = [ + os.path.join(folder, file) + for file in os.listdir(folder) + if 
os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth") + ] + index_paths = [ + os.path.join(folder, file) + for file in os.listdir(folder) + if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index") + ] + + return pth_paths, index_paths + + +def get_mediafire_download_link(url): + response = requests.get(url) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + download_button = soup.find( + "a", {"class": "input popsok", "aria-label": "Download file"} + ) + if download_button: + download_link = download_button.get("href") + return download_link + else: + return None + + +def download_from_url(url): + os.makedirs(zips_path, exist_ok=True) + if url != "": + if "drive.google.com" in url: + if "file/d/" in url: + file_id = url.split("file/d/")[1].split("/")[0] + elif "id=" in url: + file_id = url.split("id=")[1].split("&")[0] + else: + return None + + if file_id: + os.chdir(zips_path) + try: + gdown.download( + f"https://drive.google.com/uc?id={file_id}", + quiet=True, + fuzzy=True, + ) + except Exception as error: + error_message = str( + f"An error occurred downloading the file: {error}" + ) + if ( + "Too many users have viewed or downloaded this file recently" + in error_message + ): + os.chdir(now_dir) + return "too much use" + elif ( + "Cannot retrieve the public link of the file." in error_message + ): + os.chdir(now_dir) + return "private link" + else: + print(error_message) + os.chdir(now_dir) + return None + elif "disk.yandex.ru" in url: + base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?" + public_key = url + final_url = base_url + urlencode(dict(public_key=public_key)) + response = requests.get(final_url) + download_url = response.json()["href"] + download_response = requests.get(download_url) + + if download_response.status_code == 200: + filename = parse_qs(urlparse(unquote(download_url)).query).get( + "filename", [""] + )[0] + if filename: + os.chdir(zips_path) + with open(filename, "wb") as f: + f.write(download_response.content) + else: + print("Failed to get filename from URL.") + return None + + elif "pixeldrain.com" in url: + try: + file_id = url.split("pixeldrain.com/u/")[1] + os.chdir(zips_path) + print(file_id) + response = requests.get(f"https://pixeldrain.com/api/file/{file_id}") + if response.status_code == 200: + file_name = ( + response.headers.get("Content-Disposition") + .split("filename=")[-1] + .strip('";') + ) + os.makedirs(zips_path, exist_ok=True) + with open(os.path.join(zips_path, file_name), "wb") as newfile: + newfile.write(response.content) + os.chdir(file_path) + return "downloaded" + else: + os.chdir(file_path) + return None + except Exception as error: + print(f"An error occurred downloading the file: {error}") + os.chdir(file_path) + return None + + elif "cdn.discordapp.com" in url: + file = requests.get(url) + os.chdir(zips_path) + if file.status_code == 200: + name = url.split("/") + with open(os.path.join(name[-1]), "wb") as newfile: + newfile.write(file.content) + else: + return None + elif "/blob/" in url or "/resolve/" in url: + os.chdir(zips_path) + if "/blob/" in url: + url = url.replace("/blob/", "/resolve/") + + response = requests.get(url, stream=True) + if response.status_code == 200: + content_disposition = six.moves.urllib_parse.unquote( + response.headers["Content-Disposition"] + ) + m = re.search(r'filename="([^"]+)"', content_disposition) + file_name = m.groups()[0] + file_name = file_name.replace(os.path.sep, "_") + total_size_in_bytes = 
int(response.headers.get("content-length", 0)) + block_size = 1024 + progress_bar_length = 50 + progress = 0 + + with open(os.path.join(zips_path, file_name), "wb") as file: + for data in response.iter_content(block_size): + file.write(data) + progress += len(data) + progress_percent = int((progress / total_size_in_bytes) * 100) + num_dots = int( + (progress / total_size_in_bytes) * progress_bar_length + ) + progress_bar = ( + "[" + + "." * num_dots + + " " * (progress_bar_length - num_dots) + + "]" + ) + print( + f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ", + end="\r", + ) + if progress_percent == 100: + print("\n") + + else: + os.chdir(now_dir) + return None + elif "/tree/main" in url: + os.chdir(zips_path) + response = requests.get(url) + soup = BeautifulSoup(response.content, "html.parser") + temp_url = "" + for link in soup.find_all("a", href=True): + if link["href"].endswith(".zip"): + temp_url = link["href"] + break + if temp_url: + url = temp_url + url = url.replace("blob", "resolve") + if "huggingface.co" not in url: + url = "https://huggingface.co" + url + + wget.download(url) + else: + os.chdir(now_dir) + return None + elif "applio.org" in url: + parts = url.split("/") + id_with_query = parts[-1] + id_parts = id_with_query.split("?") + id_number = id_parts[0] + + url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models" + headers = { + "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10" + } + + params = {"id": f"eq.{id_number}"} + response = requests.get(url, headers=headers, params=params) + if response.status_code == 200: + json_response = response.json() + print(json_response) + if json_response: + link = json_response[0]["link"] + verify = download_from_url(link) + if verify == "downloaded": + return "downloaded" + else: + return None + else: + return None + else: + try: + os.chdir(zips_path) + wget.download(url) + except Exception as error: + os.chdir(now_dir) + print(f"An error occurred downloading the file: {error}") + return None + + for currentPath, _, zipFiles in os.walk(zips_path): + for Files in zipFiles: + filePart = Files.split(".") + extensionFile = filePart[len(filePart) - 1] + filePart.pop() + nameFile = "_".join(filePart) + realPath = os.path.join(currentPath, Files) + os.rename(realPath, nameFile + "." 
+ extensionFile) + + os.chdir(now_dir) + return "downloaded" + + os.chdir(now_dir) + return None + + +def extract_and_show_progress(zipfile_path, unzips_path): + try: + with zipfile.ZipFile(zipfile_path, "r") as zip_ref: + for file_info in zip_ref.infolist(): + zip_ref.extract(file_info, unzips_path) + os.remove(zipfile_path) + return True + except Exception as error: + print(f"An error occurred extracting the zip file: {error}") + return False + + +def unzip_file(zip_path, zip_file_name): + zip_file_path = os.path.join(zip_path, zip_file_name + ".zip") + extract_path = os.path.join(file_path, zip_file_name) + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(extract_path) + os.remove(zip_file_path) + + +def model_download_pipeline(url: str): + try: + verify = download_from_url(url) + if verify == "downloaded": + extract_folder_path = "" + for filename in os.listdir(zips_path): + if filename.endswith(".zip"): + zipfile_path = os.path.join(zips_path, filename) + print("Proceeding with the extraction...") + + model_zip = os.path.basename(zipfile_path) + model_name = format_title(model_zip.split(".zip")[0]) + extract_folder_path = os.path.join( + "logs", + os.path.normpath(model_name), + ) + success = extract_and_show_progress( + zipfile_path, extract_folder_path + ) + + macosx_path = os.path.join(extract_folder_path, "__MACOSX") + if os.path.exists(macosx_path): + shutil.rmtree(macosx_path) + + subfolders = [ + f + for f in os.listdir(extract_folder_path) + if os.path.isdir(os.path.join(extract_folder_path, f)) + ] + if len(subfolders) == 1: + subfolder_path = os.path.join( + extract_folder_path, subfolders[0] + ) + for item in os.listdir(subfolder_path): + s = os.path.join(subfolder_path, item) + d = os.path.join(extract_folder_path, item) + shutil.move(s, d) + os.rmdir(subfolder_path) + + for item in os.listdir(extract_folder_path): + if ".pth" in item: + file_name = item.split(".pth")[0] + if file_name != model_name: + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, model_name + ".pth" + ), + ) + else: + if "v2" not in item: + if "_nprobe_1_" in item and "_v1" in item: + file_name = item.split("_nprobe_1_")[1].split( + "_v1" + )[0] + if file_name != model_name: + new_file_name = ( + item.split("_nprobe_1_")[0] + + "_nprobe_1_" + + model_name + + "_v1" + ) + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, + new_file_name + ".index", + ), + ) + else: + if "_nprobe_1_" in item and "_v2" in item: + file_name = item.split("_nprobe_1_")[1].split( + "_v2" + )[0] + if file_name != model_name: + new_file_name = ( + item.split("_nprobe_1_")[0] + + "_nprobe_1_" + + model_name + + "_v2" + ) + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, + new_file_name + ".index", + ), + ) + + if success: + print(f"Model {model_name} downloaded!") + else: + print(f"Error downloading {model_name}") + return "Error" + if extract_folder_path == "": + print("Zip file was not found.") + return "Error" + result = search_pth_index(extract_folder_path) + return result + else: + return "Error" + except Exception as error: + print(f"An unexpected error occurred: {error}") + return "Error" diff --git a/rvc/lib/tools/prerequisites_download.py b/rvc/lib/tools/prerequisites_download.py new file mode 100644 index 00000000..6eb24ea6 --- /dev/null +++ b/rvc/lib/tools/prerequisites_download.py @@ -0,0 +1,104 @@ +import os +from concurrent.futures import ThreadPoolExecutor 
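+
+# Prerequisite assets (pitch predictors, the ContentVec embedder and, on
+# Windows, the ffmpeg/ffprobe executables) are fetched from the Applio
+# "Resources" folder on Hugging Face (see url_base below).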
+from tqdm import tqdm +import requests + +url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources" + +# Define the file lists +models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])] +embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])] +executables_list = [ + ("", ["ffmpeg.exe", "ffprobe.exe"]), +] + +folder_mapping_list = { + "embedders/contentvec/": "rvc/models/embedders/contentvec/", + "predictors/": "rvc/models/predictors/", + "formant/": "rvc/models/formant/", +} + + +def get_file_size_all(file_list): + """ + Calculate the total size of files to be downloaded, regardless of local existence. + """ + total_size = 0 + for remote_folder, files in file_list: + # Use the mapping if available; otherwise, use an empty local folder + local_folder = folder_mapping_list.get(remote_folder, "") + for file in files: + url = f"{url_base}/{remote_folder}{file}" + response = requests.head(url) + total_size += int(response.headers.get("content-length", 0)) + return total_size + + +def download_file(url, destination_path, global_bar): + """ + Download a file from the given URL to the specified destination path, + updating the global progress bar as data is downloaded. + """ + dir_name = os.path.dirname(destination_path) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + response = requests.get(url, stream=True) + block_size = 1024 + with open(destination_path, "wb") as file: + for data in response.iter_content(block_size): + file.write(data) + global_bar.update(len(data)) + + +def download_mapping_files(file_mapping_list, global_bar): + """ + Download all files in the provided file mapping list using a thread pool executor, + and update the global progress bar as downloads progress. + This version downloads all files regardless of whether they already exist. + """ + with ThreadPoolExecutor() as executor: + futures = [] + for remote_folder, file_list in file_mapping_list: + local_folder = folder_mapping_list.get(remote_folder, "") + for file in file_list: + destination_path = os.path.join(local_folder, file) + url = f"{url_base}/{remote_folder}{file}" + futures.append( + executor.submit(download_file, url, destination_path, global_bar) + ) + for future in futures: + future.result() + + +def calculate_total_size(models, exe): + """ + Calculate the total size of all files to be downloaded based on selected categories. + """ + total_size = 0 + if models: + total_size += get_file_size_all(models_list) + total_size += get_file_size_all(embedders_list) + if exe and os.name == "nt": + total_size += get_file_size_all(executables_list) + return total_size + + +def prerequisites_download_pipeline(models, exe): + """ + Manage the download pipeline for different categories of files. 
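+    The total size is measured up front with HTTP HEAD requests so a single
+    progress bar can track every file, and downloads run in a thread pool.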
+ """ + total_size = calculate_total_size(models, exe) + if total_size > 0: + with tqdm( + total=total_size, unit="iB", unit_scale=True, desc="Downloading all files" + ) as global_bar: + if models: + download_mapping_files(models_list, global_bar) + download_mapping_files(embedders_list, global_bar) + if exe: + if os.name == "nt": + download_mapping_files(executables_list, global_bar) + else: + print("No executables needed for non-Windows systems.") + else: + print("No files to download.") diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py new file mode 100644 index 00000000..e982fac5 --- /dev/null +++ b/rvc/lib/tools/pretrained_selector.py @@ -0,0 +1,63 @@ +def pretrained_selector(pitch_guidance): + if pitch_guidance == True: + return { + "v1": { + 32000: ( + "rvc/models/pretraineds/pretrained_v1/f0G32k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v1/f0G40k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v1/f0G48k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D48k.pth", + ), + }, + "v2": { + 32000: ( + "rvc/models/pretraineds/pretrained_v2/f0G32k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v2/f0G40k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v2/f0G48k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D48k.pth", + ), + }, + } + elif pitch_guidance == False: + return { + "v1": { + 32000: ( + "rvc/models/pretraineds/pretrained_v1/G32k.pth", + "rvc/models/pretraineds/pretrained_v1/D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v1/G40k.pth", + "rvc/models/pretraineds/pretrained_v1/D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v1/G48k.pth", + "rvc/models/pretraineds/pretrained_v1/D48k.pth", + ), + }, + "v2": { + 32000: ( + "rvc/models/pretraineds/pretrained_v2/G32k.pth", + "rvc/models/pretraineds/pretrained_v2/D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v2/G40k.pth", + "rvc/models/pretraineds/pretrained_v2/D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v2/G48k.pth", + "rvc/models/pretraineds/pretrained_v2/D48k.pth", + ), + }, + } diff --git a/rvc/lib/tools/split_audio.py b/rvc/lib/tools/split_audio.py new file mode 100644 index 00000000..65e7ba94 --- /dev/null +++ b/rvc/lib/tools/split_audio.py @@ -0,0 +1,56 @@ +import numpy as np +import librosa + + +def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250): + """ + Splits an audio signal into segments using a fixed frame size and hop size. + + Parameters: + - audio (np.ndarray): The audio signal to split. + - sr (int): The sample rate of the input audio (default is 16000). + - silence_thresh (int): Silence threshold (default =-60dB) + - min_silence_len (int): Minimum silence duration (default 250ms). + + Returns: + - list of np.ndarray: A list of audio segments. + - np.ndarray: The intervals where the audio was split. 
+ """ + frame_length = int(min_silence_len / 1000 * sr) + hop_length = frame_length // 2 + intervals = librosa.effects.split( + audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length + ) + audio_segments = [audio[start:end] for start, end in intervals] + + return audio_segments, intervals + + +def merge_audio(audio_segments, intervals, sr_orig, sr_new): + """ + Merges audio segments back into a single audio signal, filling gaps with silence. + + Parameters: + - audio_segments (list of np.ndarray): The non-silent audio segments. + - intervals (np.ndarray): The intervals used for splitting the original audio. + - sr_orig (int): The sample rate of the original audio + - sr_new (int): The sample rate of the model + + Returns: + - np.ndarray: The merged audio signal with silent gaps restored. + """ + sr_ratio = sr_new / sr_orig if sr_new > sr_orig else 1.0 + + merged_audio = np.zeros( + int(intervals[0][0] * sr_ratio if intervals[0][0] > 0 else 0), + dtype=audio_segments[0].dtype, + ) + + merged_audio = np.concatenate((merged_audio, audio_segments[0])) + + for i in range(1, len(intervals)): + silence_duration = int((intervals[i][0] - intervals[i - 1][1]) * sr_ratio) + silence = np.zeros(silence_duration, dtype=audio_segments[0].dtype) + merged_audio = np.concatenate((merged_audio, silence, audio_segments[i])) + + return merged_audio diff --git a/rvc/lib/tools/tts.py b/rvc/lib/tools/tts.py new file mode 100644 index 00000000..9b30c6e1 --- /dev/null +++ b/rvc/lib/tools/tts.py @@ -0,0 +1,29 @@ +import sys +import asyncio +import edge_tts +import os + + +async def main(): + # Parse command line arguments + tts_file = str(sys.argv[1]) + text = str(sys.argv[2]) + voice = str(sys.argv[3]) + rate = int(sys.argv[4]) + output_file = str(sys.argv[5]) + + rates = f"+{rate}%" if rate >= 0 else f"{rate}%" + if tts_file and os.path.exists(tts_file): + text = "" + try: + with open(tts_file, "r", encoding="utf-8") as file: + text = file.read() + except UnicodeDecodeError: + with open(tts_file, "r") as file: + text = file.read() + await edge_tts.Communicate(text, voice, rate=rates).save(output_file) + print(f"TTS with {voice} completed. 
Output TTS file: '{output_file}'") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/rvc/lib/tools/tts_voices.json b/rvc/lib/tools/tts_voices.json new file mode 100644 index 00000000..b76cf447 --- /dev/null +++ b/rvc/lib/tools/tts_voices.json @@ -0,0 +1,5748 @@ +[ + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", + "ShortName": "af-ZA-AdriNeural", + "Gender": "Female", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, WillemNeural)", + "ShortName": "af-ZA-WillemNeural", + "Gender": "Male", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Willem Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, AnilaNeural)", + "ShortName": "sq-AL-AnilaNeural", + "Gender": "Female", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anila Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, IlirNeural)", + "ShortName": "sq-AL-IlirNeural", + "Gender": "Male", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ilir Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, AmehaNeural)", + "ShortName": "am-ET-AmehaNeural", + "Gender": "Male", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ameha Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)", + "ShortName": "am-ET-MekdesNeural", + "Gender": "Female", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mekdes Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, AminaNeural)", + "ShortName": "ar-DZ-AminaNeural", + "Gender": "Female", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amina Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, IsmaelNeural)", + "ShortName": "ar-DZ-IsmaelNeural", + "Gender": "Male", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Ismael Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)", + "ShortName": "ar-BH-AliNeural", + "Gender": "Male", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ali Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)", + "ShortName": "ar-BH-LailaNeural", + "Gender": "Female", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laila Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, SalmaNeural)", + "ShortName": "ar-EG-SalmaNeural", + "Gender": "Female", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salma Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, ShakirNeural)", + "ShortName": "ar-EG-ShakirNeural", + "Gender": "Male", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shakir Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, BasselNeural)", + "ShortName": "ar-IQ-BasselNeural", + "Gender": "Male", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bassel Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, RanaNeural)", + "ShortName": "ar-IQ-RanaNeural", + "Gender": "Female", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rana Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, SanaNeural)", + "ShortName": "ar-JO-SanaNeural", + "Gender": "Female", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sana Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, TaimNeural)", + "ShortName": "ar-JO-TaimNeural", + "Gender": "Male", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taim Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, FahedNeural)", + "ShortName": "ar-KW-FahedNeural", + "Gender": "Male", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fahed Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, NouraNeural)", + "ShortName": "ar-KW-NouraNeural", + "Gender": "Female", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noura Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, LaylaNeural)", + "ShortName": "ar-LB-LaylaNeural", + "Gender": "Female", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Layla Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, RamiNeural)", + "ShortName": "ar-LB-RamiNeural", + "Gender": "Male", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rami Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, ImanNeural)", + "ShortName": "ar-LY-ImanNeural", + "Gender": "Female", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Iman Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, OmarNeural)", + "ShortName": "ar-LY-OmarNeural", + "Gender": "Male", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Omar Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, JamalNeural)", + "ShortName": "ar-MA-JamalNeural", + "Gender": "Male", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jamal Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, MounaNeural)", + "ShortName": "ar-MA-MounaNeural", + "Gender": "Female", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mouna Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": 
"Microsoft Server Speech Text to Speech Voice (ar-OM, AbdullahNeural)", + "ShortName": "ar-OM-AbdullahNeural", + "Gender": "Male", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abdullah Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AyshaNeural)", + "ShortName": "ar-OM-AyshaNeural", + "Gender": "Female", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aysha Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, AmalNeural)", + "ShortName": "ar-QA-AmalNeural", + "Gender": "Female", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amal Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, MoazNeural)", + "ShortName": "ar-QA-MoazNeural", + "Gender": "Male", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Moaz Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, HamedNeural)", + "ShortName": "ar-SA-HamedNeural", + "Gender": "Male", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamed Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)", + "ShortName": "ar-SA-ZariyahNeural", + "Gender": "Female", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zariyah Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, AmanyNeural)", + "ShortName": "ar-SY-AmanyNeural", + "Gender": "Female", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amany Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, LaithNeural)", + "ShortName": "ar-SY-LaithNeural", + "Gender": "Male", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laith Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, HediNeural)", + "ShortName": 
"ar-TN-HediNeural", + "Gender": "Male", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hedi Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, ReemNeural)", + "ShortName": "ar-TN-ReemNeural", + "Gender": "Female", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Reem Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)", + "ShortName": "ar-AE-FatimaNeural", + "Gender": "Female", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fatima Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)", + "ShortName": "ar-AE-HamdanNeural", + "Gender": "Male", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamdan Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, MaryamNeural)", + "ShortName": "ar-YE-MaryamNeural", + "Gender": "Female", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maryam Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, SalehNeural)", + "ShortName": "ar-YE-SalehNeural", + "Gender": "Male", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saleh Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)", + "ShortName": "az-AZ-BabekNeural", + "Gender": "Male", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Babek Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)", + "ShortName": "az-AZ-BanuNeural", + "Gender": "Female", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Banu Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)", + "ShortName": "bn-BD-NabanitaNeural", + "Gender": "Female", + "Locale": 
"bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nabanita Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)", + "ShortName": "bn-BD-PradeepNeural", + "Gender": "Male", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pradeep Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, BashkarNeural)", + "ShortName": "bn-IN-BashkarNeural", + "Gender": "Male", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bashkar Online (Natural) - Bangla (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, TanishaaNeural)", + "ShortName": "bn-IN-TanishaaNeural", + "Gender": "Female", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tanishaa Online (Natural) - Bengali (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, GoranNeural)", + "ShortName": "bs-BA-GoranNeural", + "Gender": "Male", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Goran Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, VesnaNeural)", + "ShortName": "bs-BA-VesnaNeural", + "Gender": "Female", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vesna Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)", + "ShortName": "bg-BG-BorislavNeural", + "Gender": "Male", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Borislav Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)", + "ShortName": "bg-BG-KalinaNeural", + "Gender": "Female", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kalina Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, NilarNeural)", + "ShortName": "my-MM-NilarNeural", + "Gender": "Female", + "Locale": "my-MM", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nilar Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, ThihaNeural)", + "ShortName": "my-MM-ThihaNeural", + "Gender": "Male", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thiha Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, EnricNeural)", + "ShortName": "ca-ES-EnricNeural", + "Gender": "Male", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Enric Online (Natural) - Catalan (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, JoanaNeural)", + "ShortName": "ca-ES-JoanaNeural", + "Gender": "Female", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joana Online (Natural) - Catalan (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)", + "ShortName": "zh-HK-HiuGaaiNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuGaai Online (Natural) - Chinese (Cantonese Traditional)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + "ShortName": "zh-HK-HiuMaanNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuMaan Online (Natural) - Chinese (Hong Kong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, WanLungNeural)", + "ShortName": "zh-HK-WanLungNeural", + "Gender": "Male", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft WanLung Online (Natural) - Chinese (Hong Kong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)", + "ShortName": "zh-CN-XiaoxiaoNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Warm" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoyiNeural)", + "ShortName": "zh-CN-XiaoyiNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": 
"Microsoft Xiaoyi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)", + "ShortName": "zh-CN-YunjianNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunjian Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Sports", + " Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)", + "ShortName": "zh-CN-YunxiNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Novel" + ], + "VoicePersonalities": [ + "Lively", + "Sunshine" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)", + "ShortName": "zh-CN-YunxiaNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxia Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)", + "ShortName": "zh-CN-YunyangNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunyang Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News" + ], + "VoicePersonalities": [ + "Professional", + "Reliable" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)", + "ShortName": "zh-CN-liaoning-XiaobeiNeural", + "Gender": "Female", + "Locale": "zh-CN-liaoning", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaobei Online (Natural) - Chinese (Northeastern Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Humorous" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)", + "ShortName": "zh-TW-HsiaoChenNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoChen Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)", + "ShortName": "zh-TW-YunJheNeural", + "Gender": "Male", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft YunJhe Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)", + "ShortName": "zh-TW-HsiaoYuNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoYu Online (Natural) - Chinese 
(Taiwanese Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)", + "ShortName": "zh-CN-shaanxi-XiaoniNeural", + "Gender": "Female", + "Locale": "zh-CN-shaanxi", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoni Online (Natural) - Chinese (Zhongyuan Mandarin Shaanxi)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Bright" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, GabrijelaNeural)", + "ShortName": "hr-HR-GabrijelaNeural", + "Gender": "Female", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gabrijela Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, SreckoNeural)", + "ShortName": "hr-HR-SreckoNeural", + "Gender": "Male", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Srecko Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, AntoninNeural)", + "ShortName": "cs-CZ-AntoninNeural", + "Gender": "Male", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonin Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, VlastaNeural)", + "ShortName": "cs-CZ-VlastaNeural", + "Gender": "Female", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vlasta Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, ChristelNeural)", + "ShortName": "da-DK-ChristelNeural", + "Gender": "Female", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christel Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, JeppeNeural)", + "ShortName": "da-DK-JeppeNeural", + "Gender": "Male", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jeppe Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, ArnaudNeural)", + "ShortName": "nl-BE-ArnaudNeural", + "Gender": "Male", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Arnaud Online (Natural) - Dutch (Belgium)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, DenaNeural)", + "ShortName": "nl-BE-DenaNeural", + "Gender": "Female", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dena Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, ColetteNeural)", + "ShortName": "nl-NL-ColetteNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colette Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, FennaNeural)", + "ShortName": "nl-NL-FennaNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fenna Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, MaartenNeural)", + "ShortName": "nl-NL-MaartenNeural", + "Gender": "Male", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maarten Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)", + "ShortName": "en-AU-NatashaNeural", + "Gender": "Female", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Natasha Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)", + "ShortName": "en-AU-WilliamNeural", + "Gender": "Male", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft William Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)", + "ShortName": "en-CA-ClaraNeural", + "Gender": "Female", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Clara Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)", + "ShortName": "en-CA-LiamNeural", + "Gender": "Male", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Liam Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)", + "ShortName": "en-HK-SamNeural", + "Gender": "Male", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sam Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)", + "ShortName": "en-HK-YanNeural", + "Gender": "Female", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yan Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaExpressiveNeural)", + "ShortName": "en-IN-NeerjaExpressiveNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India) (Preview)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)", + "ShortName": "en-IN-NeerjaNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)", + "ShortName": "en-IN-PrabhatNeural", + "Gender": "Male", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Prabhat Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)", + "ShortName": "en-IE-ConnorNeural", + "Gender": "Male", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Connor Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)", + "ShortName": "en-IE-EmilyNeural", + "Gender": "Female", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emily Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, AsiliaNeural)", + "ShortName": "en-KE-AsiliaNeural", + "Gender": "Female", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asilia Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + 
{ + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, ChilembaNeural)", + "ShortName": "en-KE-ChilembaNeural", + "Gender": "Male", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chilemba Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MitchellNeural)", + "ShortName": "en-NZ-MitchellNeural", + "Gender": "Male", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mitchell Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MollyNeural)", + "ShortName": "en-NZ-MollyNeural", + "Gender": "Female", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Molly Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, AbeoNeural)", + "ShortName": "en-NG-AbeoNeural", + "Gender": "Male", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abeo Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, EzinneNeural)", + "ShortName": "en-NG-EzinneNeural", + "Gender": "Female", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ezinne Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, JamesNeural)", + "ShortName": "en-PH-JamesNeural", + "Gender": "Male", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft James Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, RosaNeural)", + "ShortName": "en-PH-RosaNeural", + "Gender": "Female", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rosa Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, LunaNeural)", + "ShortName": "en-SG-LunaNeural", + "Gender": "Female", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luna Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, 
WayneNeural)", + "ShortName": "en-SG-WayneNeural", + "Gender": "Male", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Wayne Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LeahNeural)", + "ShortName": "en-ZA-LeahNeural", + "Gender": "Female", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leah Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LukeNeural)", + "ShortName": "en-ZA-LukeNeural", + "Gender": "Male", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luke Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ElimuNeural)", + "ShortName": "en-TZ-ElimuNeural", + "Gender": "Male", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elimu Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ImaniNeural)", + "ShortName": "en-TZ-ImaniNeural", + "Gender": "Female", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Imani Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)", + "ShortName": "en-GB-LibbyNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Libby Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)", + "ShortName": "en-GB-MaisieNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maisie Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)", + "ShortName": "en-GB-RyanNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ryan Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)", + "ShortName": "en-GB-SoniaNeural", + "Gender": 
"Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sonia Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)", + "ShortName": "en-GB-ThomasNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thomas Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaMultilingualNeural)", + "ShortName": "en-US-AvaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AvaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural)", + "ShortName": "en-US-AndrewMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AndrewMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", + "ShortName": "en-US-EmmaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft EmmaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianMultilingualNeural)", + "ShortName": "en-US-BrianMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft BrianMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)", + "ShortName": "en-US-AvaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ava Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)", + "ShortName": "en-US-AndrewNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrew Online (Natural) - English 
(United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaNeural)", + "ShortName": "en-US-EmmaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emma Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianNeural)", + "ShortName": "en-US-BrianNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Brian Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)", + "ShortName": "en-US-AnaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ana Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Conversation" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", + "ShortName": "en-US-AriaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Positive", + "Confident" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)", + "ShortName": "en-US-ChristopherNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christopher Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Reliable", + "Authority" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)", + "ShortName": "en-US-EricNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eric Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)", + "ShortName": "en-US-GuyNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Guy Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)", + "ShortName": "en-US-JennyNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", 
+ "FriendlyName": "Microsoft Jenny Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Considerate", + "Comfort" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)", + "ShortName": "en-US-MichelleNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Michelle Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Friendly", + "Pleasant" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)", + "ShortName": "en-US-RogerNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roger Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)", + "ShortName": "en-US-SteffanNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Steffan Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, AnuNeural)", + "ShortName": "et-EE-AnuNeural", + "Gender": "Female", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anu Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, KertNeural)", + "ShortName": "et-EE-KertNeural", + "Gender": "Male", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kert Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, AngeloNeural)", + "ShortName": "fil-PH-AngeloNeural", + "Gender": "Male", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Angelo Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, BlessicaNeural)", + "ShortName": "fil-PH-BlessicaNeural", + "Gender": "Female", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Blessica Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, HarriNeural)", + "ShortName": "fi-FI-HarriNeural", + "Gender": "Male", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft 
Harri Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, NooraNeural)", + "ShortName": "fi-FI-NooraNeural", + "Gender": "Female", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noora Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, CharlineNeural)", + "ShortName": "fr-BE-CharlineNeural", + "Gender": "Female", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Charline Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, GerardNeural)", + "ShortName": "fr-BE-GerardNeural", + "Gender": "Male", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gerard Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, ThierryNeural)", + "ShortName": "fr-CA-ThierryNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thierry Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, AntoineNeural)", + "ShortName": "fr-CA-AntoineNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antoine Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, JeanNeural)", + "ShortName": "fr-CA-JeanNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jean Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, SylvieNeural)", + "ShortName": "fr-CA-SylvieNeural", + "Gender": "Female", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sylvie Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, VivienneMultilingualNeural)", + "ShortName": "fr-FR-VivienneMultilingualNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft VivienneMultilingual Online (Natural) - French 
(France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, RemyMultilingualNeural)", + "ShortName": "fr-FR-RemyMultilingualNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft RemyMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)", + "ShortName": "fr-FR-DeniseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Denise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, EloiseNeural)", + "ShortName": "fr-FR-EloiseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eloise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, HenriNeural)", + "ShortName": "fr-FR-HenriNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Henri Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, ArianeNeural)", + "ShortName": "fr-CH-ArianeNeural", + "Gender": "Female", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ariane Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, FabriceNeural)", + "ShortName": "fr-CH-FabriceNeural", + "Gender": "Male", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fabrice Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, RoiNeural)", + "ShortName": "gl-ES-RoiNeural", + "Gender": "Male", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roi Online (Natural) - Galician (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, SabelaNeural)", + "ShortName": "gl-ES-SabelaNeural", + "Gender": "Female", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sabela Online (Natural) - Galician (Spain)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, EkaNeural)", + "ShortName": "ka-GE-EkaNeural", + "Gender": "Female", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eka Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, GiorgiNeural)", + "ShortName": "ka-GE-GiorgiNeural", + "Gender": "Male", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giorgi Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, IngridNeural)", + "ShortName": "de-AT-IngridNeural", + "Gender": "Female", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ingrid Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, JonasNeural)", + "ShortName": "de-AT-JonasNeural", + "Gender": "Male", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jonas Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, SeraphinaMultilingualNeural)", + "ShortName": "de-DE-SeraphinaMultilingualNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SeraphinaMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, FlorianMultilingualNeural)", + "ShortName": "de-DE-FlorianMultilingualNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft FlorianMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, AmalaNeural)", + "ShortName": "de-DE-AmalaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amala Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, ConradNeural)", + "ShortName": "de-DE-ConradNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Conrad Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)", + "ShortName": "de-DE-KatjaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Katja Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KillianNeural)", + "ShortName": "de-DE-KillianNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Killian Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, JanNeural)", + "ShortName": "de-CH-JanNeural", + "Gender": "Male", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jan Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, LeniNeural)", + "ShortName": "de-CH-LeniNeural", + "Gender": "Female", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leni Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, AthinaNeural)", + "ShortName": "el-GR-AthinaNeural", + "Gender": "Female", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Athina Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, NestorasNeural)", + "ShortName": "el-GR-NestorasNeural", + "Gender": "Male", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nestoras Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, DhwaniNeural)", + "ShortName": "gu-IN-DhwaniNeural", + "Gender": "Female", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dhwani Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, NiranjanNeural)", + "ShortName": "gu-IN-NiranjanNeural", + "Gender": "Male", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niranjan Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, AvriNeural)", + "ShortName": "he-IL-AvriNeural", + "Gender": "Male", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Avri Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, HilaNeural)", + "ShortName": "he-IL-HilaNeural", + "Gender": "Female", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hila Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, MadhurNeural)", + "ShortName": "hi-IN-MadhurNeural", + "Gender": "Male", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madhur Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, SwaraNeural)", + "ShortName": "hi-IN-SwaraNeural", + "Gender": "Female", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Swara Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, NoemiNeural)", + "ShortName": "hu-HU-NoemiNeural", + "Gender": "Female", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noemi Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, TamasNeural)", + "ShortName": "hu-HU-TamasNeural", + "Gender": "Male", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tamas Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GudrunNeural)", + "ShortName": "is-IS-GudrunNeural", + "Gender": "Female", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gudrun Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GunnarNeural)", + "ShortName": "is-IS-GunnarNeural", + "Gender": "Male", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gunnar Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, 
ArdiNeural)", + "ShortName": "id-ID-ArdiNeural", + "Gender": "Male", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ardi Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, GadisNeural)", + "ShortName": "id-ID-GadisNeural", + "Gender": "Female", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gadis Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, ColmNeural)", + "ShortName": "ga-IE-ColmNeural", + "Gender": "Male", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colm Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, OrlaNeural)", + "ShortName": "ga-IE-OrlaNeural", + "Gender": "Female", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Orla Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, GiuseppeNeural)", + "ShortName": "it-IT-GiuseppeNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giuseppe Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, DiegoNeural)", + "ShortName": "it-IT-DiegoNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Diego Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)", + "ShortName": "it-IT-ElsaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elsa Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, IsabellaNeural)", + "ShortName": "it-IT-IsabellaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Isabella Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)", + "ShortName": "ja-JP-KeitaNeural", + "Gender": "Male", + "Locale": "ja-JP", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keita Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)", + "ShortName": "ja-JP-NanamiNeural", + "Gender": "Female", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nanami Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, DimasNeural)", + "ShortName": "jv-ID-DimasNeural", + "Gender": "Male", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dimas Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, SitiNeural)", + "ShortName": "jv-ID-SitiNeural", + "Gender": "Female", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siti Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, GaganNeural)", + "ShortName": "kn-IN-GaganNeural", + "Gender": "Male", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gagan Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, SapnaNeural)", + "ShortName": "kn-IN-SapnaNeural", + "Gender": "Female", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sapna Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, AigulNeural)", + "ShortName": "kk-KZ-AigulNeural", + "Gender": "Female", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aigul Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, DauletNeural)", + "ShortName": "kk-KZ-DauletNeural", + "Gender": "Male", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daulet Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, PisethNeural)", + "ShortName": "km-KH-PisethNeural", + "Gender": "Male", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Piseth 
Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, SreymomNeural)", + "ShortName": "km-KH-SreymomNeural", + "Gender": "Female", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sreymom Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, HyunsuNeural)", + "ShortName": "ko-KR-HyunsuNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hyunsu Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)", + "ShortName": "ko-KR-InJoonNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft InJoon Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)", + "ShortName": "ko-KR-SunHiNeural", + "Gender": "Female", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SunHi Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, ChanthavongNeural)", + "ShortName": "lo-LA-ChanthavongNeural", + "Gender": "Male", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chanthavong Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, KeomanyNeural)", + "ShortName": "lo-LA-KeomanyNeural", + "Gender": "Female", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keomany Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, EveritaNeural)", + "ShortName": "lv-LV-EveritaNeural", + "Gender": "Female", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Everita Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, NilsNeural)", + "ShortName": "lv-LV-NilsNeural", + "Gender": "Male", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nils Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, LeonasNeural)", + "ShortName": "lt-LT-LeonasNeural", + "Gender": "Male", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leonas Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, OnaNeural)", + "ShortName": "lt-LT-OnaNeural", + "Gender": "Female", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ona Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, AleksandarNeural)", + "ShortName": "mk-MK-AleksandarNeural", + "Gender": "Male", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aleksandar Online (Natural) - Macedonian (Republic of North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, MarijaNeural)", + "ShortName": "mk-MK-MarijaNeural", + "Gender": "Female", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marija Online (Natural) - Macedonian (Republic of North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, OsmanNeural)", + "ShortName": "ms-MY-OsmanNeural", + "Gender": "Male", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Osman Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, YasminNeural)", + "ShortName": "ms-MY-YasminNeural", + "Gender": "Female", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yasmin Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, MidhunNeural)", + "ShortName": "ml-IN-MidhunNeural", + "Gender": "Male", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Midhun Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, SobhanaNeural)", + "ShortName": "ml-IN-SobhanaNeural", + "Gender": "Female", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sobhana Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, GraceNeural)", + "ShortName": "mt-MT-GraceNeural", + "Gender": "Female", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Grace Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, JosephNeural)", + "ShortName": "mt-MT-JosephNeural", + "Gender": "Male", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joseph Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, AarohiNeural)", + "ShortName": "mr-IN-AarohiNeural", + "Gender": "Female", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aarohi Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, ManoharNeural)", + "ShortName": "mr-IN-ManoharNeural", + "Gender": "Male", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manohar Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, BataaNeural)", + "ShortName": "mn-MN-BataaNeural", + "Gender": "Male", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bataa Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, YesuiNeural)", + "ShortName": "mn-MN-YesuiNeural", + "Gender": "Female", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yesui Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, HemkalaNeural)", + "ShortName": "ne-NP-HemkalaNeural", + "Gender": "Female", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hemkala Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, SagarNeural)", + "ShortName": "ne-NP-SagarNeural", + "Gender": "Male", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sagar Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, FinnNeural)", + "ShortName": "nb-NO-FinnNeural", + "Gender": "Male", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Finn Online (Natural) - Norwegian (Bokmål Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, PernilleNeural)", + "ShortName": "nb-NO-PernilleNeural", + "Gender": "Female", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pernille Online (Natural) - Norwegian (Bokmål, Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, GulNawazNeural)", + "ShortName": "ps-AF-GulNawazNeural", + "Gender": "Male", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GulNawaz Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, LatifaNeural)", + "ShortName": "ps-AF-LatifaNeural", + "Gender": "Female", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Latifa Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, DilaraNeural)", + "ShortName": "fa-IR-DilaraNeural", + "Gender": "Female", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dilara Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, FaridNeural)", + "ShortName": "fa-IR-FaridNeural", + "Gender": "Male", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Farid Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, MarekNeural)", + "ShortName": "pl-PL-MarekNeural", + "Gender": "Male", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marek Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, ZofiaNeural)", + "ShortName": "pl-PL-ZofiaNeural", + "Gender": "Female", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zofia Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech 
Text to Speech Voice (pt-BR, ThalitaNeural)", + "ShortName": "pt-BR-ThalitaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thalita Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)", + "ShortName": "pt-BR-AntonioNeural", + "Gender": "Male", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonio Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)", + "ShortName": "pt-BR-FranciscaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Francisca Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)", + "ShortName": "pt-PT-DuarteNeural", + "Gender": "Male", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Duarte Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)", + "ShortName": "pt-PT-RaquelNeural", + "Gender": "Female", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Raquel Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, AlinaNeural)", + "ShortName": "ro-RO-AlinaNeural", + "Gender": "Female", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alina Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, EmilNeural)", + "ShortName": "ro-RO-EmilNeural", + "Gender": "Male", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emil Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, DmitryNeural)", + "ShortName": "ru-RU-DmitryNeural", + "Gender": "Male", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dmitry Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, SvetlanaNeural)", + 
"ShortName": "ru-RU-SvetlanaNeural", + "Gender": "Female", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Svetlana Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, NicholasNeural)", + "ShortName": "sr-RS-NicholasNeural", + "Gender": "Male", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nicholas Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, SophieNeural)", + "ShortName": "sr-RS-SophieNeural", + "Gender": "Female", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sophie Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, SameeraNeural)", + "ShortName": "si-LK-SameeraNeural", + "Gender": "Male", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sameera Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, ThiliniNeural)", + "ShortName": "si-LK-ThiliniNeural", + "Gender": "Female", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thilini Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, LukasNeural)", + "ShortName": "sk-SK-LukasNeural", + "Gender": "Male", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lukas Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, ViktoriaNeural)", + "ShortName": "sk-SK-ViktoriaNeural", + "Gender": "Female", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Viktoria Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, PetraNeural)", + "ShortName": "sl-SI-PetraNeural", + "Gender": "Female", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Petra Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, RokNeural)", + "ShortName": "sl-SI-RokNeural", + "Gender": "Male", + 
"Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rok Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, MuuseNeural)", + "ShortName": "so-SO-MuuseNeural", + "Gender": "Male", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Muuse Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, UbaxNeural)", + "ShortName": "so-SO-UbaxNeural", + "Gender": "Female", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ubax Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, ElenaNeural)", + "ShortName": "es-AR-ElenaNeural", + "Gender": "Female", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elena Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, TomasNeural)", + "ShortName": "es-AR-TomasNeural", + "Gender": "Male", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tomas Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, MarceloNeural)", + "ShortName": "es-BO-MarceloNeural", + "Gender": "Male", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marcelo Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, SofiaNeural)", + "ShortName": "es-BO-SofiaNeural", + "Gender": "Female", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofia Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, CatalinaNeural)", + "ShortName": "es-CL-CatalinaNeural", + "Gender": "Female", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Catalina Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, LorenzoNeural)", + "ShortName": "es-CL-LorenzoNeural", + "Gender": "Male", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Lorenzo Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, XimenaNeural)", + "ShortName": "es-ES-XimenaNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ximena Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, GonzaloNeural)", + "ShortName": "es-CO-GonzaloNeural", + "Gender": "Male", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gonzalo Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, SalomeNeural)", + "ShortName": "es-CO-SalomeNeural", + "Gender": "Female", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salome Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, JuanNeural)", + "ShortName": "es-CR-JuanNeural", + "Gender": "Male", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Juan Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, MariaNeural)", + "ShortName": "es-CR-MariaNeural", + "Gender": "Female", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maria Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, BelkysNeural)", + "ShortName": "es-CU-BelkysNeural", + "Gender": "Female", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Belkys Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, ManuelNeural)", + "ShortName": "es-CU-ManuelNeural", + "Gender": "Male", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manuel Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, EmilioNeural)", + "ShortName": "es-DO-EmilioNeural", + "Gender": "Male", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emilio Online (Natural) - Spanish (Dominican 
Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, RamonaNeural)", + "ShortName": "es-DO-RamonaNeural", + "Gender": "Female", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ramona Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, AndreaNeural)", + "ShortName": "es-EC-AndreaNeural", + "Gender": "Female", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrea Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, LuisNeural)", + "ShortName": "es-EC-LuisNeural", + "Gender": "Male", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luis Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, LorenaNeural)", + "ShortName": "es-SV-LorenaNeural", + "Gender": "Female", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorena Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, RodrigoNeural)", + "ShortName": "es-SV-RodrigoNeural", + "Gender": "Male", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rodrigo Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, JavierNeural)", + "ShortName": "es-GQ-JavierNeural", + "Gender": "Male", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Javier Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, TeresaNeural)", + "ShortName": "es-GQ-TeresaNeural", + "Gender": "Female", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Teresa Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, AndresNeural)", + "ShortName": "es-GT-AndresNeural", + "Gender": "Male", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andres Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, MartaNeural)", + "ShortName": "es-GT-MartaNeural", + "Gender": "Female", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marta Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, CarlosNeural)", + "ShortName": "es-HN-CarlosNeural", + "Gender": "Male", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Carlos Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, KarlaNeural)", + "ShortName": "es-HN-KarlaNeural", + "Gender": "Female", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karla Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)", + "ShortName": "es-MX-DaliaNeural", + "Gender": "Female", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dalia Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)", + "ShortName": "es-MX-JorgeNeural", + "Gender": "Male", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jorge Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, FedericoNeural)", + "ShortName": "es-NI-FedericoNeural", + "Gender": "Male", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Federico Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, YolandaNeural)", + "ShortName": "es-NI-YolandaNeural", + "Gender": "Female", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yolanda Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, MargaritaNeural)", + "ShortName": "es-PA-MargaritaNeural", + "Gender": "Female", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Margarita Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, RobertoNeural)", + "ShortName": "es-PA-RobertoNeural", + "Gender": "Male", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roberto Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, MarioNeural)", + "ShortName": "es-PY-MarioNeural", + "Gender": "Male", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mario Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, TaniaNeural)", + "ShortName": "es-PY-TaniaNeural", + "Gender": "Female", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tania Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, AlexNeural)", + "ShortName": "es-PE-AlexNeural", + "Gender": "Male", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alex Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, CamilaNeural)", + "ShortName": "es-PE-CamilaNeural", + "Gender": "Female", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Camila Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, KarinaNeural)", + "ShortName": "es-PR-KarinaNeural", + "Gender": "Female", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karina Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, VictorNeural)", + "ShortName": "es-PR-VictorNeural", + "Gender": "Male", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Victor Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, AlvaroNeural)", + "ShortName": "es-ES-AlvaroNeural", + "Gender": "Male", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alvaro Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": 
"Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)", + "ShortName": "es-ES-ElviraNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elvira Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, AlonsoNeural)", + "ShortName": "es-US-AlonsoNeural", + "Gender": "Male", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alonso Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, PalomaNeural)", + "ShortName": "es-US-PalomaNeural", + "Gender": "Female", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paloma Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, MateoNeural)", + "ShortName": "es-UY-MateoNeural", + "Gender": "Male", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mateo Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, ValentinaNeural)", + "ShortName": "es-UY-ValentinaNeural", + "Gender": "Female", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valentina Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, PaolaNeural)", + "ShortName": "es-VE-PaolaNeural", + "Gender": "Female", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paola Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, SebastianNeural)", + "ShortName": "es-VE-SebastianNeural", + "Gender": "Male", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sebastian Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, JajangNeural)", + "ShortName": "su-ID-JajangNeural", + "Gender": "Male", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jajang Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech 
Voice (su-ID, TutiNeural)", + "ShortName": "su-ID-TutiNeural", + "Gender": "Female", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tuti Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, RafikiNeural)", + "ShortName": "sw-KE-RafikiNeural", + "Gender": "Male", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rafiki Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, ZuriNeural)", + "ShortName": "sw-KE-ZuriNeural", + "Gender": "Female", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zuri Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, DaudiNeural)", + "ShortName": "sw-TZ-DaudiNeural", + "Gender": "Male", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daudi Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, RehemaNeural)", + "ShortName": "sw-TZ-RehemaNeural", + "Gender": "Female", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rehema Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, MattiasNeural)", + "ShortName": "sv-SE-MattiasNeural", + "Gender": "Male", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mattias Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, SofieNeural)", + "ShortName": "sv-SE-SofieNeural", + "Gender": "Female", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofie Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, PallaviNeural)", + "ShortName": "ta-IN-PallaviNeural", + "Gender": "Female", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pallavi Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, ValluvarNeural)", + "ShortName": "ta-IN-ValluvarNeural", + "Gender": 
"Male", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valluvar Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, KaniNeural)", + "ShortName": "ta-MY-KaniNeural", + "Gender": "Female", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kani Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, SuryaNeural)", + "ShortName": "ta-MY-SuryaNeural", + "Gender": "Male", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Surya Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, AnbuNeural)", + "ShortName": "ta-SG-AnbuNeural", + "Gender": "Male", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anbu Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, VenbaNeural)", + "ShortName": "ta-SG-VenbaNeural", + "Gender": "Female", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Venba Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, KumarNeural)", + "ShortName": "ta-LK-KumarNeural", + "Gender": "Male", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kumar Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, SaranyaNeural)", + "ShortName": "ta-LK-SaranyaNeural", + "Gender": "Female", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saranya Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, MohanNeural)", + "ShortName": "te-IN-MohanNeural", + "Gender": "Male", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mohan Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, ShrutiNeural)", + "ShortName": "te-IN-ShrutiNeural", + "Gender": "Female", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": 
"Microsoft Shruti Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)", + "ShortName": "th-TH-NiwatNeural", + "Gender": "Male", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niwat Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)", + "ShortName": "th-TH-PremwadeeNeural", + "Gender": "Female", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Premwadee Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, AhmetNeural)", + "ShortName": "tr-TR-AhmetNeural", + "Gender": "Male", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ahmet Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, EmelNeural)", + "ShortName": "tr-TR-EmelNeural", + "Gender": "Female", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emel Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, OstapNeural)", + "ShortName": "uk-UA-OstapNeural", + "Gender": "Male", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ostap Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, PolinaNeural)", + "ShortName": "uk-UA-PolinaNeural", + "Gender": "Female", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Polina Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, GulNeural)", + "ShortName": "ur-IN-GulNeural", + "Gender": "Female", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gul Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, SalmanNeural)", + "ShortName": "ur-IN-SalmanNeural", + "Gender": "Male", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salman Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, AsadNeural)", + "ShortName": "ur-PK-AsadNeural", + "Gender": "Male", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asad Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, UzmaNeural)", + "ShortName": "ur-PK-UzmaNeural", + "Gender": "Female", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Uzma Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, MadinaNeural)", + "ShortName": "uz-UZ-MadinaNeural", + "Gender": "Female", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madina Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, SardorNeural)", + "ShortName": "uz-UZ-SardorNeural", + "Gender": "Male", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sardor Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)", + "ShortName": "vi-VN-HoaiMyNeural", + "Gender": "Female", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HoaiMy Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)", + "ShortName": "vi-VN-NamMinhNeural", + "Gender": "Male", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft NamMinh Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, AledNeural)", + "ShortName": "cy-GB-AledNeural", + "Gender": "Male", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aled Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)", + "ShortName": "cy-GB-NiaNeural", + "Gender": "Female", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nia Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)", + "ShortName": "zu-ZA-ThandoNeural", + "Gender": "Female", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thando Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)", + "ShortName": "zu-ZA-ThembaNeural", + "Gender": "Male", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Themba Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + } +] \ No newline at end of file diff --git a/rvc/lib/utils.py b/rvc/lib/utils.py new file mode 100644 index 00000000..c514e8bb --- /dev/null +++ b/rvc/lib/utils.py @@ -0,0 +1,137 @@ +import os, sys +import librosa +import soundfile as sf +import numpy as np +import re +import unicodedata +import wget +from pydub import AudioSegment +from torch import nn + +import logging +from transformers import HubertModel +import warnings + +# Remove this to see warnings about transformers models +warnings.filterwarnings("ignore") + +logging.getLogger("fairseq").setLevel(logging.ERROR) +logging.getLogger("faiss.loader").setLevel(logging.ERROR) +logging.getLogger("transformers").setLevel(logging.ERROR) +logging.getLogger("torch").setLevel(logging.ERROR) + +now_dir = os.getcwd() +sys.path.append(now_dir) + +base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift") +stft = base_path + ".exe" if sys.platform == "win32" else base_path + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def load_audio(file, sample_rate): + try: + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + + return audio.flatten() + + +def load_audio_infer( + file, + sample_rate, + **kwargs, +): + formant_shifting = kwargs.get("formant_shifting", False) + try: + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + if not os.path.isfile(file): + raise FileNotFoundError(f"File not found: {file}") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate) + if formant_shifting: + formant_qfrency = kwargs.get("formant_qfrency", 0.8) + formant_timbre = kwargs.get("formant_timbre", 0.8) + + from stftpitchshift import StftPitchShift + + pitchshifter = StftPitchShift(1024, 32, sample_rate) + audio = pitchshifter.shiftpitch( + audio, + factors=1, + quefrency=formant_qfrency * 1e-3, + distortion=formant_timbre, + ) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + return np.array(audio).flatten() + + +def format_title(title): + formatted_title = ( + unicodedata.normalize("NFKD", title).encode("ascii", 
"ignore").decode("utf-8") + ) + formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title) + formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title) + formatted_title = re.sub(r"\s+", "_", formatted_title) + return formatted_title + + +def load_embedding(embedder_model, custom_embedder=None): + embedder_root = os.path.join(now_dir, "rvc", "models", "embedders") + embedding_list = { + "contentvec": os.path.join(embedder_root, "contentvec"), + "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"), + "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"), + "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"), + } + + online_embedders = { + "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin", + } + + config_files = { + "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json", + } + + if embedder_model == "custom": + if os.path.exists(custom_embedder): + model_path = custom_embedder + else: + print(f"Custom embedder not found: {custom_embedder}, using contentvec") + model_path = embedding_list["contentvec"] + else: + model_path = embedding_list[embedder_model] + bin_file = os.path.join(model_path, "pytorch_model.bin") + json_file = os.path.join(model_path, "config.json") + os.makedirs(model_path, exist_ok=True) + if not os.path.exists(bin_file): + url = online_embedders[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=bin_file) + if not os.path.exists(json_file): + url = config_files[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=json_file) + + models = HubertModelWithFinalProj.from_pretrained(model_path) + return models diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py new file mode 100644 index 00000000..482009cc --- /dev/null +++ b/rvc/lib/zluda.py @@ -0,0 +1,43 @@ +import torch + +if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): + _torch_stft = torch.stft + + def z_stft( + audio: torch.Tensor, + n_fft: int, + hop_length: int = None, + win_length: int = None, + window: torch.Tensor = None, + center: bool = True, + pad_mode: str = "reflect", + normalized: bool = False, + onesided: bool = None, + return_complex: bool = None, + ): + sd = audio.device + return _torch_stft( + audio.to("cpu"), + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window.to("cpu"), + center=center, + pad_mode=pad_mode, + normalized=normalized, + onesided=onesided, + return_complex=return_complex, + ).to(sd) + + def z_jit(f, *_, **__): + f.graph = torch._C.Graph() + return f + + # hijacks + 
torch.stft = z_stft + torch.jit.script = z_jit + # disabling unsupported cudnn + torch.backends.cudnn.enabled = False + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + torch.backends.cuda.enable_mem_efficient_sdp(False) diff --git a/rvc_cli.py b/rvc_cli.py new file mode 100644 index 00000000..d0bd406a --- /dev/null +++ b/rvc_cli.py @@ -0,0 +1,1897 @@ +import os +import sys +import json +import argparse +import subprocess +from functools import lru_cache +from distutils.util import strtobool + +now_dir = os.getcwd() +sys.path.append(now_dir) + +current_script_directory = os.path.dirname(os.path.realpath(__file__)) +logs_path = os.path.join(current_script_directory, "logs") + +from rvc.lib.tools.analyzer import analyze_audio +from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline +from rvc.lib.tools.model_download import model_download_pipeline + +python = sys.executable + + +# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4 +@lru_cache(maxsize=1) # Cache only one result since the file is static +def load_voices_data(): + with open( + os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8" + ) as file: + return json.load(file) + + +voices_data = load_voices_data() +locales = list({voice["ShortName"] for voice in voices_data}) + + +@lru_cache(maxsize=None) +def import_voice_converter(): + from rvc.infer.infer import VoiceConverter + + return VoiceConverter() + + +@lru_cache(maxsize=1) +def get_config(): + from rvc.configs.config import Config + + return Config() + + +# Infer +def run_infer_script( + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + input_path: str, + output_path: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + f0_autotune_strength: float, + clean_audio: bool, + clean_strength: float, + export_format: str, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, + formant_shifting: bool = False, + formant_qfrency: float = 1.0, + formant_timbre: float = 1.0, + post_process: bool = False, + reverb: bool = False, + pitch_shift: bool = False, + limiter: bool = False, + gain: bool = False, + distortion: bool = False, + chorus: bool = False, + bitcrush: bool = False, + clipping: bool = False, + compressor: bool = False, + delay: bool = False, + reverb_room_size: float = 0.5, + reverb_damping: float = 0.5, + reverb_wet_gain: float = 0.5, + reverb_dry_gain: float = 0.5, + reverb_width: float = 0.5, + reverb_freeze_mode: float = 0.5, + pitch_shift_semitones: float = 0.0, + limiter_threshold: float = -6, + limiter_release_time: float = 0.01, + gain_db: float = 0.0, + distortion_gain: float = 25, + chorus_rate: float = 1.0, + chorus_depth: float = 0.25, + chorus_center_delay: float = 7, + chorus_feedback: float = 0.0, + chorus_mix: float = 0.5, + bitcrush_bit_depth: int = 8, + clipping_threshold: float = -6, + compressor_threshold: float = 0, + compressor_ratio: float = 1, + compressor_attack: float = 1.0, + compressor_release: float = 100, + delay_seconds: float = 0.5, + delay_feedback: float = 0.0, + delay_mix: float = 0.5, + sid: int = 0, +): + kwargs = { + "audio_input_path": input_path, + "audio_output_path": output_path, + "model_path": pth_path, + "index_path": index_path, + "pitch": pitch, + "filter_radius": filter_radius, + "index_rate": index_rate, + 
"volume_envelope": volume_envelope, + "protect": protect, + "hop_length": hop_length, + "f0_method": f0_method, + "pth_path": pth_path, + "index_path": index_path, + "split_audio": split_audio, + "f0_autotune": f0_autotune, + "f0_autotune_strength": f0_autotune_strength, + "clean_audio": clean_audio, + "clean_strength": clean_strength, + "export_format": export_format, + "f0_file": f0_file, + "embedder_model": embedder_model, + "embedder_model_custom": embedder_model_custom, + "post_process": post_process, + "formant_shifting": formant_shifting, + "formant_qfrency": formant_qfrency, + "formant_timbre": formant_timbre, + "reverb": reverb, + "pitch_shift": pitch_shift, + "limiter": limiter, + "gain": gain, + "distortion": distortion, + "chorus": chorus, + "bitcrush": bitcrush, + "clipping": clipping, + "compressor": compressor, + "delay": delay, + "reverb_room_size": reverb_room_size, + "reverb_damping": reverb_damping, + "reverb_wet_level": reverb_wet_gain, + "reverb_dry_level": reverb_dry_gain, + "reverb_width": reverb_width, + "reverb_freeze_mode": reverb_freeze_mode, + "pitch_shift_semitones": pitch_shift_semitones, + "limiter_threshold": limiter_threshold, + "limiter_release": limiter_release_time, + "gain_db": gain_db, + "distortion_gain": distortion_gain, + "chorus_rate": chorus_rate, + "chorus_depth": chorus_depth, + "chorus_delay": chorus_center_delay, + "chorus_feedback": chorus_feedback, + "chorus_mix": chorus_mix, + "bitcrush_bit_depth": bitcrush_bit_depth, + "clipping_threshold": clipping_threshold, + "compressor_threshold": compressor_threshold, + "compressor_ratio": compressor_ratio, + "compressor_attack": compressor_attack, + "compressor_release": compressor_release, + "delay_seconds": delay_seconds, + "delay_feedback": delay_feedback, + "delay_mix": delay_mix, + "sid": sid, + } + infer_pipeline = import_voice_converter() + infer_pipeline.convert_audio( + **kwargs, + ) + return f"File {input_path} inferred successfully.", output_path.replace( + ".wav", f".{export_format.lower()}" + ) + + +# Batch infer +def run_batch_infer_script( + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + input_folder: str, + output_folder: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + f0_autotune_strength: float, + clean_audio: bool, + clean_strength: float, + export_format: str, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, + formant_shifting: bool = False, + formant_qfrency: float = 1.0, + formant_timbre: float = 1.0, + post_process: bool = False, + reverb: bool = False, + pitch_shift: bool = False, + limiter: bool = False, + gain: bool = False, + distortion: bool = False, + chorus: bool = False, + bitcrush: bool = False, + clipping: bool = False, + compressor: bool = False, + delay: bool = False, + reverb_room_size: float = 0.5, + reverb_damping: float = 0.5, + reverb_wet_gain: float = 0.5, + reverb_dry_gain: float = 0.5, + reverb_width: float = 0.5, + reverb_freeze_mode: float = 0.5, + pitch_shift_semitones: float = 0.0, + limiter_threshold: float = -6, + limiter_release_time: float = 0.01, + gain_db: float = 0.0, + distortion_gain: float = 25, + chorus_rate: float = 1.0, + chorus_depth: float = 0.25, + chorus_center_delay: float = 7, + chorus_feedback: float = 0.0, + chorus_mix: float = 0.5, + bitcrush_bit_depth: int = 8, + clipping_threshold: float = -6, + compressor_threshold: float = 0, + compressor_ratio: float = 1, + compressor_attack: 
float = 1.0, + compressor_release: float = 100, + delay_seconds: float = 0.5, + delay_feedback: float = 0.0, + delay_mix: float = 0.5, + sid: int = 0, +): + kwargs = { + "audio_input_paths": input_folder, + "audio_output_path": output_folder, + "model_path": pth_path, + "index_path": index_path, + "pitch": pitch, + "filter_radius": filter_radius, + "index_rate": index_rate, + "volume_envelope": volume_envelope, + "protect": protect, + "hop_length": hop_length, + "f0_method": f0_method, + "pth_path": pth_path, + "index_path": index_path, + "split_audio": split_audio, + "f0_autotune": f0_autotune, + "f0_autotune_strength": f0_autotune_strength, + "clean_audio": clean_audio, + "clean_strength": clean_strength, + "export_format": export_format, + "f0_file": f0_file, + "embedder_model": embedder_model, + "embedder_model_custom": embedder_model_custom, + "post_process": post_process, + "formant_shifting": formant_shifting, + "formant_qfrency": formant_qfrency, + "formant_timbre": formant_timbre, + "reverb": reverb, + "pitch_shift": pitch_shift, + "limiter": limiter, + "gain": gain, + "distortion": distortion, + "chorus": chorus, + "bitcrush": bitcrush, + "clipping": clipping, + "compressor": compressor, + "delay": delay, + "reverb_room_size": reverb_room_size, + "reverb_damping": reverb_damping, + "reverb_wet_level": reverb_wet_gain, + "reverb_dry_level": reverb_dry_gain, + "reverb_width": reverb_width, + "reverb_freeze_mode": reverb_freeze_mode, + "pitch_shift_semitones": pitch_shift_semitones, + "limiter_threshold": limiter_threshold, + "limiter_release": limiter_release_time, + "gain_db": gain_db, + "distortion_gain": distortion_gain, + "chorus_rate": chorus_rate, + "chorus_depth": chorus_depth, + "chorus_delay": chorus_center_delay, + "chorus_feedback": chorus_feedback, + "chorus_mix": chorus_mix, + "bitcrush_bit_depth": bitcrush_bit_depth, + "clipping_threshold": clipping_threshold, + "compressor_threshold": compressor_threshold, + "compressor_ratio": compressor_ratio, + "compressor_attack": compressor_attack, + "compressor_release": compressor_release, + "delay_seconds": delay_seconds, + "delay_feedback": delay_feedback, + "delay_mix": delay_mix, + "sid": sid, + } + infer_pipeline = import_voice_converter() + infer_pipeline.convert_audio_batch( + **kwargs, + ) + + return f"Files from {input_folder} inferred successfully." 
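+# Example invocation of the "batch_infer" subcommand defined in parse_arguments()
+# below (illustrative only; the folder and model paths are placeholders):
+#   python rvc_cli.py batch_infer --input_folder audio_input --output_folder audio_output \
+#       --pth_path logs/my_model/my_model.pth --index_path logs/my_model/my_model.index \
+#       --f0_method rmvpe --export_format WAV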
+ + +# TTS +def run_tts_script( + tts_file: str, + tts_text: str, + tts_voice: str, + tts_rate: int, + pitch: int, + filter_radius: int, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + f0_method: str, + output_tts_path: str, + output_rvc_path: str, + pth_path: str, + index_path: str, + split_audio: bool, + f0_autotune: bool, + f0_autotune_strength: float, + clean_audio: bool, + clean_strength: float, + export_format: str, + f0_file: str, + embedder_model: str, + embedder_model_custom: str = None, + sid: int = 0, +): + + tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py") + + if os.path.exists(output_tts_path): + os.remove(output_tts_path) + + command_tts = [ + *map( + str, + [ + python, + tts_script_path, + tts_file, + tts_text, + tts_voice, + tts_rate, + output_tts_path, + ], + ), + ] + subprocess.run(command_tts) + infer_pipeline = import_voice_converter() + infer_pipeline.convert_audio( + pitch=pitch, + filter_radius=filter_radius, + index_rate=index_rate, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + f0_method=f0_method, + audio_input_path=output_tts_path, + audio_output_path=output_rvc_path, + model_path=pth_path, + index_path=index_path, + split_audio=split_audio, + f0_autotune=f0_autotune, + f0_autotune_strength=f0_autotune_strength, + clean_audio=clean_audio, + clean_strength=clean_strength, + export_format=export_format, + f0_file=f0_file, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + sid=sid, + formant_shifting=None, + formant_qfrency=None, + formant_timbre=None, + post_process=None, + reverb=None, + pitch_shift=None, + limiter=None, + gain=None, + distortion=None, + chorus=None, + bitcrush=None, + clipping=None, + compressor=None, + delay=None, + sliders=None, + ) + + return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace( + ".wav", f".{export_format.lower()}" + ) + + +# Model information +def run_model_information_script(pth_path: str): + print(model_information(pth_path)) + return model_information(pth_path) + + +# Model blender +def run_model_blender_script( + model_name: str, pth_path_1: str, pth_path_2: str, ratio: float +): + message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio) + return message, model_blended + + +# Tensorboard +def run_tensorboard_script(): + launch_tensorboard_pipeline() + + +# Download +def run_download_script(model_link: str): + model_download_pipeline(model_link) + return f"Model downloaded successfully." + + +# Audio analyzer +def run_audio_analyzer_script( + input_path: str, save_plot_path: str = "logs/audio_analysis.png" +): + audio_info, plot_path = analyze_audio(input_path, save_plot_path) + print( + f"Audio info of {input_path}: {audio_info}", + f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}", + ) + return audio_info, plot_path + + +# Parse arguments +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Run the main.py script with specific parameters." + ) + subparsers = parser.add_subparsers( + title="subcommands", dest="mode", help="Choose a mode" + ) + + # Parser for 'infer' mode + infer_parser = subparsers.add_parser("infer", help="Run inference") + pitch_description = ( + "Set the pitch of the audio. Higher values result in a higher pitch." 
+ ) + infer_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio." + infer_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning." + infer_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[i / 100.0 for i in range(0, 101)], + default=0.3, + ) + volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used." + infer_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[i / 100.0 for i in range(0, 101)], + default=1, + ) + protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect." + infer_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[i / 1000.0 for i in range(0, 501)], + default=0.33, + ) + hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy." + infer_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended." + infer_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + infer_parser.add_argument( + "--input_path", + type=str, + help="Full path to the input audio file.", + required=True, + ) + infer_parser.add_argument( + "--output_path", + type=str, + help="Full path to the output audio file.", + required=True, + ) + pth_path_description = "Full path to the RVC model file (.pth)." + infer_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + index_path_description = "Full path to the index file (.index)." + infer_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files." + infer_parser.add_argument( + "--split_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=split_audio_description, + default=False, + ) + f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions." 
+ infer_parser.add_argument( + "--f0_autotune", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid." + infer_parser.add_argument( + "--f0_autotune_strength", + type=float, + help=f0_autotune_strength_description, + choices=[(i / 10) for i in range(11)], + default=1.0, + ) + clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions." + infer_parser.add_argument( + "--clean_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=clean_audio_description, + default=False, + ) + clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound." + infer_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + export_format_description = "Select the desired output audio format." + infer_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + embedder_model_description = ( + "Choose the model used for generating speaker embeddings." + ) + infer_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "chinese-hubert-base", + "japanese-hubert-base", + "korean-hubert-base", + "custom", + ], + default="contentvec", + ) + embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'." + infer_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio." + infer_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice." + infer_parser.add_argument( + "--formant_shifting", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=formant_shifting_description, + default=False, + required=False, + ) + formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect." + infer_parser.add_argument( + "--formant_qfrency", + type=float, + help=formant_qfrency_description, + default=1.0, + required=False, + ) + formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect." + infer_parser.add_argument( + "--formant_timbre", + type=float, + help=formant_timbre_description, + default=1.0, + required=False, + ) + sid_description = "Speaker ID for multi-speaker models." + infer_parser.add_argument( + "--sid", + type=int, + help=sid_description, + default=0, + required=False, + ) + post_process_description = "Apply post-processing effects to the output audio." 
+ infer_parser.add_argument( + "--post_process", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=post_process_description, + default=False, + required=False, + ) + reverb_description = "Apply reverb effect to the output audio." + infer_parser.add_argument( + "--reverb", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=reverb_description, + default=False, + required=False, + ) + + pitch_shift_description = "Apply pitch shifting effect to the output audio." + infer_parser.add_argument( + "--pitch_shift", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=pitch_shift_description, + default=False, + required=False, + ) + + limiter_description = "Apply limiter effect to the output audio." + infer_parser.add_argument( + "--limiter", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=limiter_description, + default=False, + required=False, + ) + + gain_description = "Apply gain effect to the output audio." + infer_parser.add_argument( + "--gain", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=gain_description, + default=False, + required=False, + ) + + distortion_description = "Apply distortion effect to the output audio." + infer_parser.add_argument( + "--distortion", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=distortion_description, + default=False, + required=False, + ) + + chorus_description = "Apply chorus effect to the output audio." + infer_parser.add_argument( + "--chorus", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=chorus_description, + default=False, + required=False, + ) + + bitcrush_description = "Apply bitcrush effect to the output audio." + infer_parser.add_argument( + "--bitcrush", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=bitcrush_description, + default=False, + required=False, + ) + + clipping_description = "Apply clipping effect to the output audio." + infer_parser.add_argument( + "--clipping", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=clipping_description, + default=False, + required=False, + ) + + compressor_description = "Apply compressor effect to the output audio." + infer_parser.add_argument( + "--compressor", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=compressor_description, + default=False, + required=False, + ) + + delay_description = "Apply delay effect to the output audio." + infer_parser.add_argument( + "--delay", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=delay_description, + default=False, + required=False, + ) + + reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size." + infer_parser.add_argument( + "--reverb_room_size", + type=float, + help=reverb_room_size_description, + default=0.5, + required=False, + ) + + reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound." + infer_parser.add_argument( + "--reverb_damping", + type=float, + help=reverb_damping_description, + default=0.5, + required=False, + ) + + reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect." + infer_parser.add_argument( + "--reverb_wet_gain", + type=float, + help=reverb_wet_gain_description, + default=0.5, + required=False, + ) + + reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal." 
+ infer_parser.add_argument( + "--reverb_dry_gain", + type=float, + help=reverb_dry_gain_description, + default=0.5, + required=False, + ) + + reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image." + infer_parser.add_argument( + "--reverb_width", + type=float, + help=reverb_width_description, + default=0.5, + required=False, + ) + + reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect." + infer_parser.add_argument( + "--reverb_freeze_mode", + type=float, + help=reverb_freeze_mode_description, + default=0.5, + required=False, + ) + + pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it." + infer_parser.add_argument( + "--pitch_shift_semitones", + type=float, + help=pitch_shift_semitones_description, + default=0.0, + required=False, + ) + + limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect." + infer_parser.add_argument( + "--limiter_threshold", + type=float, + help=limiter_threshold_description, + default=-6, + required=False, + ) + + limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time." + infer_parser.add_argument( + "--limiter_release_time", + type=float, + help=limiter_release_time_description, + default=0.01, + required=False, + ) + + gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it." + infer_parser.add_argument( + "--gain_db", + type=float, + help=gain_db_description, + default=0.0, + required=False, + ) + + distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect." + infer_parser.add_argument( + "--distortion_gain", + type=float, + help=distortion_gain_description, + default=25, + required=False, + ) + + chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect." + infer_parser.add_argument( + "--chorus_rate", + type=float, + help=chorus_rate_description, + default=1.0, + required=False, + ) + + chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect." + infer_parser.add_argument( + "--chorus_depth", + type=float, + help=chorus_depth_description, + default=0.25, + required=False, + ) + + chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay." + infer_parser.add_argument( + "--chorus_center_delay", + type=float, + help=chorus_center_delay_description, + default=7, + required=False, + ) + + chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect." + infer_parser.add_argument( + "--chorus_feedback", + type=float, + help=chorus_feedback_description, + default=0.0, + required=False, + ) + + chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect." + infer_parser.add_argument( + "--chorus_mix", + type=float, + help=chorus_mix_description, + default=0.5, + required=False, + ) + + bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect." 
+ infer_parser.add_argument( + "--bitcrush_bit_depth", + type=int, + help=bitcrush_bit_depth_description, + default=8, + required=False, + ) + + clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect." + infer_parser.add_argument( + "--clipping_threshold", + type=float, + help=clipping_threshold_description, + default=-6, + required=False, + ) + + compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect." + infer_parser.add_argument( + "--compressor_threshold", + type=float, + help=compressor_threshold_description, + default=0, + required=False, + ) + + compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect." + infer_parser.add_argument( + "--compressor_ratio", + type=float, + help=compressor_ratio_description, + default=1, + required=False, + ) + + compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect." + infer_parser.add_argument( + "--compressor_attack", + type=float, + help=compressor_attack_description, + default=1.0, + required=False, + ) + + compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect." + infer_parser.add_argument( + "--compressor_release", + type=float, + help=compressor_release_description, + default=100, + required=False, + ) + + delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time." + infer_parser.add_argument( + "--delay_seconds", + type=float, + help=delay_seconds_description, + default=0.5, + required=False, + ) + delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect." + infer_parser.add_argument( + "--delay_feedback", + type=float, + help=delay_feedback_description, + default=0.0, + required=False, + ) + delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect." 
+ infer_parser.add_argument( + "--delay_mix", + type=float, + help=delay_mix_description, + default=0.5, + required=False, + ) + + # Parser for 'batch_infer' mode + batch_infer_parser = subparsers.add_parser( + "batch_infer", + help="Run batch inference", + ) + batch_infer_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + batch_infer_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + batch_infer_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[i / 100.0 for i in range(0, 101)], + default=0.3, + ) + batch_infer_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[i / 100.0 for i in range(0, 101)], + default=1, + ) + batch_infer_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[i / 1000.0 for i in range(0, 501)], + default=0.33, + ) + batch_infer_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + batch_infer_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + batch_infer_parser.add_argument( + "--input_folder", + type=str, + help="Path to the folder containing input audio files.", + required=True, + ) + batch_infer_parser.add_argument( + "--output_folder", + type=str, + help="Path to the folder for saving output audio files.", + required=True, + ) + batch_infer_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + batch_infer_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + batch_infer_parser.add_argument( + "--split_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=split_audio_description, + default=False, + ) + batch_infer_parser.add_argument( + "--f0_autotune", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + batch_infer_parser.add_argument( + "--f0_autotune_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=1.0, + ) + batch_infer_parser.add_argument( + "--clean_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=clean_audio_description, + default=False, + ) + batch_infer_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + batch_infer_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + batch_infer_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "chinese-hubert-base", + "japanese-hubert-base", + "korean-hubert-base", + "custom", + ], + default="contentvec", + ) + batch_infer_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + batch_infer_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + batch_infer_parser.add_argument( + "--formant_shifting", + type=lambda x: 
bool(strtobool(x)), + choices=[True, False], + help=formant_shifting_description, + default=False, + required=False, + ) + batch_infer_parser.add_argument( + "--formant_qfrency", + type=float, + help=formant_qfrency_description, + default=1.0, + required=False, + ) + batch_infer_parser.add_argument( + "--formant_timbre", + type=float, + help=formant_timbre_description, + default=1.0, + required=False, + ) + batch_infer_parser.add_argument( + "--sid", + type=int, + help=sid_description, + default=0, + required=False, + ) + batch_infer_parser.add_argument( + "--post_process", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=post_process_description, + default=False, + required=False, + ) + batch_infer_parser.add_argument( + "--reverb", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=reverb_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--pitch_shift", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=pitch_shift_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--limiter", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=limiter_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--gain", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=gain_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--distortion", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=distortion_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--chorus", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=chorus_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--bitcrush", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=bitcrush_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--clipping", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=clipping_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--compressor", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=compressor_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--delay", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=delay_description, + default=False, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_room_size", + type=float, + help=reverb_room_size_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_damping", + type=float, + help=reverb_damping_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_wet_gain", + type=float, + help=reverb_wet_gain_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_dry_gain", + type=float, + help=reverb_dry_gain_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_width", + type=float, + help=reverb_width_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--reverb_freeze_mode", + type=float, + help=reverb_freeze_mode_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--pitch_shift_semitones", + type=float, + help=pitch_shift_semitones_description, + default=0.0, + 
required=False, + ) + + batch_infer_parser.add_argument( + "--limiter_threshold", + type=float, + help=limiter_threshold_description, + default=-6, + required=False, + ) + + batch_infer_parser.add_argument( + "--limiter_release_time", + type=float, + help=limiter_release_time_description, + default=0.01, + required=False, + ) + batch_infer_parser.add_argument( + "--gain_db", + type=float, + help=gain_db_description, + default=0.0, + required=False, + ) + + batch_infer_parser.add_argument( + "--distortion_gain", + type=float, + help=distortion_gain_description, + default=25, + required=False, + ) + + batch_infer_parser.add_argument( + "--chorus_rate", + type=float, + help=chorus_rate_description, + default=1.0, + required=False, + ) + + batch_infer_parser.add_argument( + "--chorus_depth", + type=float, + help=chorus_depth_description, + default=0.25, + required=False, + ) + batch_infer_parser.add_argument( + "--chorus_center_delay", + type=float, + help=chorus_center_delay_description, + default=7, + required=False, + ) + + batch_infer_parser.add_argument( + "--chorus_feedback", + type=float, + help=chorus_feedback_description, + default=0.0, + required=False, + ) + + batch_infer_parser.add_argument( + "--chorus_mix", + type=float, + help=chorus_mix_description, + default=0.5, + required=False, + ) + + batch_infer_parser.add_argument( + "--bitcrush_bit_depth", + type=int, + help=bitcrush_bit_depth_description, + default=8, + required=False, + ) + + batch_infer_parser.add_argument( + "--clipping_threshold", + type=float, + help=clipping_threshold_description, + default=-6, + required=False, + ) + + batch_infer_parser.add_argument( + "--compressor_threshold", + type=float, + help=compressor_threshold_description, + default=0, + required=False, + ) + + batch_infer_parser.add_argument( + "--compressor_ratio", + type=float, + help=compressor_ratio_description, + default=1, + required=False, + ) + + batch_infer_parser.add_argument( + "--compressor_attack", + type=float, + help=compressor_attack_description, + default=1.0, + required=False, + ) + + batch_infer_parser.add_argument( + "--compressor_release", + type=float, + help=compressor_release_description, + default=100, + required=False, + ) + batch_infer_parser.add_argument( + "--delay_seconds", + type=float, + help=delay_seconds_description, + default=0.5, + required=False, + ) + batch_infer_parser.add_argument( + "--delay_feedback", + type=float, + help=delay_feedback_description, + default=0.0, + required=False, + ) + batch_infer_parser.add_argument( + "--delay_mix", + type=float, + help=delay_mix_description, + default=0.5, + required=False, + ) + + # Parser for 'tts' mode + tts_parser = subparsers.add_parser("tts", help="Run TTS inference") + tts_parser.add_argument( + "--tts_file", type=str, help="File with a text to be synthesized", required=True + ) + tts_parser.add_argument( + "--tts_text", type=str, help="Text to be synthesized", required=True + ) + tts_parser.add_argument( + "--tts_voice", + type=str, + help="Voice to be used for TTS synthesis.", + choices=locales, + required=True, + ) + tts_parser.add_argument( + "--tts_rate", + type=int, + help="Control the speaking rate of the TTS. 
Values range from -100 (slower) to 100 (faster).", + choices=range(-100, 101), + default=0, + ) + tts_parser.add_argument( + "--pitch", + type=int, + help=pitch_description, + choices=range(-24, 25), + default=0, + ) + tts_parser.add_argument( + "--filter_radius", + type=int, + help=filter_radius_description, + choices=range(11), + default=3, + ) + tts_parser.add_argument( + "--index_rate", + type=float, + help=index_rate_description, + choices=[(i / 10) for i in range(11)], + default=0.3, + ) + tts_parser.add_argument( + "--volume_envelope", + type=float, + help=volume_envelope_description, + choices=[(i / 10) for i in range(11)], + default=1, + ) + tts_parser.add_argument( + "--protect", + type=float, + help=protect_description, + choices=[(i / 10) for i in range(6)], + default=0.33, + ) + tts_parser.add_argument( + "--hop_length", + type=int, + help=hop_length_description, + choices=range(1, 513), + default=128, + ) + tts_parser.add_argument( + "--f0_method", + type=str, + help=f0_method_description, + choices=[ + "crepe", + "crepe-tiny", + "rmvpe", + "fcpe", + "hybrid[crepe+rmvpe]", + "hybrid[crepe+fcpe]", + "hybrid[rmvpe+fcpe]", + "hybrid[crepe+rmvpe+fcpe]", + ], + default="rmvpe", + ) + tts_parser.add_argument( + "--output_tts_path", + type=str, + help="Full path to save the synthesized TTS audio.", + required=True, + ) + tts_parser.add_argument( + "--output_rvc_path", + type=str, + help="Full path to save the voice-converted audio using the synthesized TTS.", + required=True, + ) + tts_parser.add_argument( + "--pth_path", type=str, help=pth_path_description, required=True + ) + tts_parser.add_argument( + "--index_path", type=str, help=index_path_description, required=True + ) + tts_parser.add_argument( + "--split_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=split_audio_description, + default=False, + ) + tts_parser.add_argument( + "--f0_autotune", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=f0_autotune_description, + default=False, + ) + tts_parser.add_argument( + "--f0_autotune_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=1.0, + ) + tts_parser.add_argument( + "--clean_audio", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + help=clean_audio_description, + default=False, + ) + tts_parser.add_argument( + "--clean_strength", + type=float, + help=clean_strength_description, + choices=[(i / 10) for i in range(11)], + default=0.7, + ) + tts_parser.add_argument( + "--export_format", + type=str, + help=export_format_description, + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + default="WAV", + ) + tts_parser.add_argument( + "--embedder_model", + type=str, + help=embedder_model_description, + choices=[ + "contentvec", + "chinese-hubert-base", + "japanese-hubert-base", + "korean-hubert-base", + "custom", + ], + default="contentvec", + ) + tts_parser.add_argument( + "--embedder_model_custom", + type=str, + help=embedder_model_custom_description, + default=None, + ) + tts_parser.add_argument( + "--f0_file", + type=str, + help=f0_file_description, + default=None, + ) + + # Parser for 'model_information' mode + model_information_parser = subparsers.add_parser( + "model_information", help="Display information about a trained model." 
+ ) + model_information_parser.add_argument( + "--pth_path", type=str, help="Path to the .pth model file.", required=True + ) + + # Parser for 'model_blender' mode + model_blender_parser = subparsers.add_parser( + "model_blender", help="Fuse two RVC models together." + ) + model_blender_parser.add_argument( + "--model_name", type=str, help="Name of the new fused model.", required=True + ) + model_blender_parser.add_argument( + "--pth_path_1", + type=str, + help="Path to the first .pth model file.", + required=True, + ) + model_blender_parser.add_argument( + "--pth_path_2", + type=str, + help="Path to the second .pth model file.", + required=True, + ) + model_blender_parser.add_argument( + "--ratio", + type=float, + help="Ratio for blending the two models (0.0 to 1.0).", + choices=[(i / 10) for i in range(11)], + default=0.5, + ) + + # Parser for 'tensorboard' mode + subparsers.add_parser( + "tensorboard", help="Launch TensorBoard for monitoring training progress." + ) + + # Parser for 'download' mode + download_parser = subparsers.add_parser( + "download", help="Download a model from a provided link." + ) + download_parser.add_argument( + "--model_link", type=str, help="Direct link to the model file.", required=True + ) + + # Parser for 'prerequisites' mode + prerequisites_parser = subparsers.add_parser( + "prerequisites", help="Install prerequisites for RVC." + ) + prerequisites_parser.add_argument( + "--models", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + default=True, + help="Download additional models.", + ) + prerequisites_parser.add_argument( + "--exe", + type=lambda x: bool(strtobool(x)), + choices=[True, False], + default=True, + help="Download required executables.", + ) + + # Parser for 'audio_analyzer' mode + audio_analyzer = subparsers.add_parser( + "audio_analyzer", help="Analyze an audio file." 
+ ) + audio_analyzer.add_argument( + "--input_path", type=str, help="Path to the input audio file.", required=True + ) + + return parser.parse_args() + + +def main(): + if len(sys.argv) == 1: + print("Please run the script with '-h' for more information.") + sys.exit(1) + + args = parse_arguments() + + try: + if args.mode == "infer": + run_infer_script( + pitch=args.pitch, + filter_radius=args.filter_radius, + index_rate=args.index_rate, + volume_envelope=args.volume_envelope, + protect=args.protect, + hop_length=args.hop_length, + f0_method=args.f0_method, + input_path=args.input_path, + output_path=args.output_path, + pth_path=args.pth_path, + index_path=args.index_path, + split_audio=args.split_audio, + f0_autotune=args.f0_autotune, + f0_autotune_strength=args.f0_autotune_strength, + clean_audio=args.clean_audio, + clean_strength=args.clean_strength, + export_format=args.export_format, + embedder_model=args.embedder_model, + embedder_model_custom=args.embedder_model_custom, + f0_file=args.f0_file, + formant_shifting=args.formant_shifting, + formant_qfrency=args.formant_qfrency, + formant_timbre=args.formant_timbre, + sid=args.sid, + post_process=args.post_process, + reverb=args.reverb, + pitch_shift=args.pitch_shift, + limiter=args.limiter, + gain=args.gain, + distortion=args.distortion, + chorus=args.chorus, + bitcrush=args.bitcrush, + clipping=args.clipping, + compressor=args.compressor, + delay=args.delay, + reverb_room_size=args.reverb_room_size, + reverb_damping=args.reverb_damping, + reverb_wet_gain=args.reverb_wet_gain, + reverb_dry_gain=args.reverb_dry_gain, + reverb_width=args.reverb_width, + reverb_freeze_mode=args.reverb_freeze_mode, + pitch_shift_semitones=args.pitch_shift_semitones, + limiter_threshold=args.limiter_threshold, + limiter_release_time=args.limiter_release_time, + gain_db=args.gain_db, + distortion_gain=args.distortion_gain, + chorus_rate=args.chorus_rate, + chorus_depth=args.chorus_depth, + chorus_center_delay=args.chorus_center_delay, + chorus_feedback=args.chorus_feedback, + chorus_mix=args.chorus_mix, + bitcrush_bit_depth=args.bitcrush_bit_depth, + clipping_threshold=args.clipping_threshold, + compressor_threshold=args.compressor_threshold, + compressor_ratio=args.compressor_ratio, + compressor_attack=args.compressor_attack, + compressor_release=args.compressor_release, + delay_seconds=args.delay_seconds, + delay_feedback=args.delay_feedback, + delay_mix=args.delay_mix, + ) + elif args.mode == "batch_infer": + run_batch_infer_script( + pitch=args.pitch, + filter_radius=args.filter_radius, + index_rate=args.index_rate, + volume_envelope=args.volume_envelope, + protect=args.protect, + hop_length=args.hop_length, + f0_method=args.f0_method, + input_folder=args.input_folder, + output_folder=args.output_folder, + pth_path=args.pth_path, + index_path=args.index_path, + split_audio=args.split_audio, + f0_autotune=args.f0_autotune, + f0_autotune_strength=args.f0_autotune_strength, + clean_audio=args.clean_audio, + clean_strength=args.clean_strength, + export_format=args.export_format, + embedder_model=args.embedder_model, + embedder_model_custom=args.embedder_model_custom, + f0_file=args.f0_file, + formant_shifting=args.formant_shifting, + formant_qfrency=args.formant_qfrency, + formant_timbre=args.formant_timbre, + sid=args.sid, + post_process=args.post_process, + reverb=args.reverb, + pitch_shift=args.pitch_shift, + limiter=args.limiter, + gain=args.gain, + distortion=args.distortion, + chorus=args.chorus, + bitcrush=args.bitcrush, + clipping=args.clipping, + 
compressor=args.compressor, + delay=args.delay, + reverb_room_size=args.reverb_room_size, + reverb_damping=args.reverb_damping, + reverb_wet_gain=args.reverb_wet_gain, + reverb_dry_gain=args.reverb_dry_gain, + reverb_width=args.reverb_width, + reverb_freeze_mode=args.reverb_freeze_mode, + pitch_shift_semitones=args.pitch_shift_semitones, + limiter_threshold=args.limiter_threshold, + limiter_release_time=args.limiter_release_time, + gain_db=args.gain_db, + distortion_gain=args.distortion_gain, + chorus_rate=args.chorus_rate, + chorus_depth=args.chorus_depth, + chorus_center_delay=args.chorus_center_delay, + chorus_feedback=args.chorus_feedback, + chorus_mix=args.chorus_mix, + bitcrush_bit_depth=args.bitcrush_bit_depth, + clipping_threshold=args.clipping_threshold, + compressor_threshold=args.compressor_threshold, + compressor_ratio=args.compressor_ratio, + compressor_attack=args.compressor_attack, + compressor_release=args.compressor_release, + delay_seconds=args.delay_seconds, + delay_feedback=args.delay_feedback, + delay_mix=args.delay_mix, + ) + elif args.mode == "tts": + run_tts_script( + tts_file=args.tts_file, + tts_text=args.tts_text, + tts_voice=args.tts_voice, + tts_rate=args.tts_rate, + pitch=args.pitch, + filter_radius=args.filter_radius, + index_rate=args.index_rate, + volume_envelope=args.volume_envelope, + protect=args.protect, + hop_length=args.hop_length, + f0_method=args.f0_method, + output_tts_path=args.output_tts_path, + output_rvc_path=args.output_rvc_path, + pth_path=args.pth_path, + index_path=args.index_path, + split_audio=args.split_audio, + f0_autotune=args.f0_autotune, + f0_autotune_strength=args.f0_autotune_strength, + clean_audio=args.clean_audio, + clean_strength=args.clean_strength, + export_format=args.export_format, + embedder_model=args.embedder_model, + embedder_model_custom=args.embedder_model_custom, + f0_file=args.f0_file, + ) + elif args.mode == "model_information": + run_model_information_script( + pth_path=args.pth_path, + ) + elif args.mode == "model_blender": + run_model_blender_script( + model_name=args.model_name, + pth_path_1=args.pth_path_1, + pth_path_2=args.pth_path_2, + ratio=args.ratio, + ) + elif args.mode == "tensorboard": + run_tensorboard_script() + elif args.mode == "download": + run_download_script( + model_link=args.model_link, + ) + elif args.mode == "audio_analyzer": + run_audio_analyzer_script( + input_path=args.input_path, + ) + except Exception as error: + print(f"An error occurred during execution: {error}") + + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/stftpitchshift b/stftpitchshift deleted file mode 100644 index 4f62e315..00000000 --- a/stftpitchshift +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb2f50ea8e5ca1a11a587f11f25ba9182f9b24e2367ac480f430b3f04062782e -size 1822104
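For quick reference, the command-line interface defined by the new Python entry point above can be exercised as in the sketches below. These are illustrative only: the script name (shown here as core.py), every model, audio, and text path, and the TTS voice are assumed placeholders rather than values taken from this diff; all flags shown are registered by the parsers above.

    # Batch voice conversion over a folder of audio files
    python core.py batch_infer --input_folder ./audio_input --output_folder ./audio_output --pth_path ./logs/model.pth --index_path ./logs/model.index --f0_method rmvpe --export_format WAV

    # Text-to-speech followed by voice conversion (note that both --tts_file and --tts_text are marked required by the tts parser above)
    python core.py tts --tts_file ./text.txt --tts_text "Hello world" --tts_voice VOICE_FROM_LOCALES --output_tts_path ./tts_output.wav --output_rvc_path ./rvc_output.wav --pth_path ./logs/model.pth --index_path ./logs/model.index

    # Download a model from a direct link
    python core.py download --model_link DIRECT_LINK_TO_MODEL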