From 1e0eaeba2becbf7a632f29971c96b1ac380a2af7 Mon Sep 17 00:00:00 2001 From: pgosar Date: Wed, 17 Apr 2024 16:32:50 -0500 Subject: [PATCH 1/7] add files --- inference_demo.py | 1 + speech_editing_demo.py | 1 + 2 files changed, 2 insertions(+) create mode 100644 inference_demo.py create mode 100644 speech_editing_demo.py diff --git a/inference_demo.py b/inference_demo.py new file mode 100644 index 00000000..85e6ff19 --- /dev/null +++ b/inference_demo.py @@ -0,0 +1 @@ +# WIP diff --git a/speech_editing_demo.py b/speech_editing_demo.py new file mode 100644 index 00000000..85e6ff19 --- /dev/null +++ b/speech_editing_demo.py @@ -0,0 +1 @@ +# WIP From 63736f72696f1492beb7e0406c4e7a736a9f2eb1 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 13:01:44 -0500 Subject: [PATCH 2/7] add TTS --- inference_demo.py | 185 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 184 insertions(+), 1 deletion(-) diff --git a/inference_demo.py b/inference_demo.py index 85e6ff19..bf142941 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -1 +1,184 @@ -# WIP +""" +This script will allow you to run TTS inference with Voicecraft +Before getting started, be sure to follow the environment setup. 
+""" + +from inference_tts_scale import inference_one_sample +from models import voicecraft +from data.tokenizer import ( + AudioTokenizer, + TextTokenizer, +) +from IPython.display import display, Audio +import argparse +import random +import numpy as np +import torchaudio +import torch +import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["USER"] = "me" # TODO change this to your username + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="VoiceCraft Inference: see the script for more information on the options") + + parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], + help="VoiceCraft model to use") + parser.add_argument("--codec_audio_sr", type=int, + default=16000, help="Audio sampling rate for the codec") + parser.add_argument("--codec_sr", type=int, default=50, + help="Sampling rate for the codec") + parser.add_argument("--top_k", type=int, default=0, + help="Top-k sampling value") + parser.add_argument("--top_p", type=float, default=0.9, + help="Top-p sampling value") + parser.add_argument("--temperature", type=float, + default=1.0, help="Temperature for sampling") + parser.add_argument("--silence_tokens", type=int, nargs="*", + default=[1388, 1898, 131], help="Silence token IDs") + parser.add_argument("--kvcache", type=int, default=1, + help="Key-value cache flag (0 or 1)") + parser.add_argument("--stop_repetition", type=int, + default=3, help="Stop repetition for generation") + parser.add_argument("--sample_batch_size", type=int, + default=3, help="Batch size for sampling") + parser.add_argument("--seed", type=int, default=1, + help="Random seed for reproducibility") + parser.add_argument("--output_dir", type=str, default="./generated_tts", + help="directory to save generated audio") + 
parser.add_argument("--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of target audio file") + parser.add_argument("--original_transcript", type=str, + default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", + help="original audio transcript") + parser.add_argument("--target_transcript", type=str, + default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!", + help="target audio transcript") + parser.add_argument("--cut_off_sec", type=float, default=3.6, + help="cut off point in seconds for input prompt") + args = parser.parse_args() + return args + + +args = parse_arguments() + +voicecraft_name = args.model_name +# hyperparameters for inference +codec_audio_sr = args.codec_audio_sr +codec_sr = args.codec_sr +top_k = args.top_k +top_p = args.top_p # defaults to 0.9 can also try 0.8, but 0.9 seems to work better +temperature = args.temperature +silence_tokens = args.silence_tokens +kvcache = args.kvcache # NOTE if OOM, change this to 0, or try the 330M model + +# NOTE adjust the below three arguments if the generation is not as good +# NOTE if the model generates long silences, reduce the stop_repetition to 3, 2 or even 1 +stop_repetition = args.stop_repetition + +# NOTE: if there are long silences or unnaturally stretched words, +# increase sample_batch_size to 4 or higher. What this will do to the model is that the +# model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. +# So if the speech rate of the generated audio is too fast, change it to a smaller number. 
+sample_batch_size = args.sample_batch_size +seed = args.seed # change seed if you are still unhappy with the result + +# load the model +model = voicecraft.VoiceCraft.from_pretrained( + f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") +phn2num = model.args.phn2num +config = vars(model.args) +model.to(device) + +encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" +if not os.path.exists(encodec_fn): + os.system( + f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") + os.system( + f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") +# will also put the neural codec model on gpu +audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) + +text_tokenizer = TextTokenizer(backend="espeak") + +# Prepare your audio +# point to the original audio whose speech you want to clone +# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file +orig_audio = args.original_audio +orig_transcript = args.original_transcript + +# move the audio and transcript to temp folder +temp_folder = "./demo/temp" +os.makedirs(temp_folder, exist_ok=True) +os.system(f"cp {orig_audio} {temp_folder}") +filename = os.path.splitext(orig_audio.split("/")[-1])[0] +with open(f"{temp_folder}/{filename}.txt", "w") as f: + f.write(orig_transcript) +# run MFA to get the alignment +align_temp = f"{temp_folder}/mfa_alignments" + +os.system("source ~/.bashrc && \ + conda activate voicecraft && \ + mfa align -v --clean -j 1 --output_format csv {temp_folder} \ + english_us_arpa english_us_arpa {align_temp}" + ) + +# # if the above fails, it could be because the audio is too hard for the alignment model, +# increasing the beam size usually solves the issue +# os.system("source ~/.bashrc && \ +# conda activate voicecraft && \ +# mfa align -v --clean -j 1 --output_format csv {temp_folder} \ +# english_us_arpa english_us_arpa {align_temp} --beam 
1000 --retry_beam 2000") + +# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt +cut_off_sec = args.cut_off_sec # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio +target_transcript = args.target_transcript +# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. +audio_fn = f"{temp_folder}/{filename}.wav" +info = torchaudio.info(audio_fn) +audio_dur = info.num_frames / info.sample_rate + +assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}" +prompt_end_frame = int(cut_off_sec * info.sample_rate) + +# run the model to get the output + + +def seed_everything(seed): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +seed_everything(seed) + +decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, + "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size} +concated_audio, gen_audio = inference_one_sample(model, argparse.Namespace( + **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame) + +# save segments for comparison +concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu() +# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}") + +# save the audio +# output_dir +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) +seg_save_fn_gen = 
f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav" +seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav" + +torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr) +torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr) + +# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored From b8bb2ab592f6146d882f16cd7aab0869aea45889 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 15:25:43 -0500 Subject: [PATCH 3/7] add beam size cmd args --- inference_demo.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/inference_demo.py b/inference_demo.py index bf142941..7e438ae9 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -50,6 +50,11 @@ def parse_arguments(): default=3, help="Batch size for sampling") parser.add_argument("--seed", type=int, default=1, help="Random seed for reproducibility") + parser.add_argument("--beam_size", type=int, default=10, + help="beam size for MFA alignment") + parser.add_argument("--retry_beam_size", type=int, default=40, + help="retry beam size for MFA alignment") + parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") parser.add_argument("--original_audio", type=str, @@ -67,7 +72,6 @@ def parse_arguments(): args = parse_arguments() - voicecraft_name = args.model_name # hyperparameters for inference codec_audio_sr = args.codec_audio_sr @@ -122,19 +126,15 @@ def parse_arguments(): f.write(orig_transcript) # run MFA to get the alignment align_temp = f"{temp_folder}/mfa_alignments" - +beam_size = args.beam_size +retry_beam_size = args.retry_beam_size os.system("source ~/.bashrc && \ conda activate voicecraft && \ mfa align -v --clean -j 1 --output_format csv {temp_folder} \ - english_us_arpa english_us_arpa {align_temp}" + english_us_arpa english_us_arpa {align_temp} --beam {beam_size} 
--retry_beam {retry_beam_size}" ) - -# # if the above fails, it could be because the audio is too hard for the alignment model, +# if the above fails, it could be because the audio is too hard for the alignment model, # increasing the beam size usually solves the issue -# os.system("source ~/.bashrc && \ -# conda activate voicecraft && \ -# mfa align -v --clean -j 1 --output_format csv {temp_folder} \ -# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000") # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt cut_off_sec = args.cut_off_sec # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio From 59877c085e49b61cb441bb0624b3fbd842da2d43 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 18:38:09 -0500 Subject: [PATCH 4/7] add speech editing --- inference_demo.py | 16 ++- speech_editing_demo.py | 220 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 225 insertions(+), 11 deletions(-) diff --git a/inference_demo.py b/inference_demo.py index 7e438ae9..7c9f62e5 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -9,7 +9,6 @@ AudioTokenizer, TextTokenizer, ) -from IPython.display import display, Audio import argparse import random import numpy as np @@ -25,7 +24,7 @@ def parse_arguments(): parser = argparse.ArgumentParser( - description="VoiceCraft Inference: see the script for more information on the options") + description="VoiceCraft TTS Inference: see the script for more information on the options") parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], @@ -34,15 +33,15 @@ def parse_arguments(): default=16000, help="Audio sampling rate for the codec") parser.add_argument("--codec_sr", type=int, default=50, 
help="Sampling rate for the codec") - parser.add_argument("--top_k", type=int, default=0, - help="Top-k sampling value") + parser.add_argument("--top_k", type=float, default=0, + help="Top-k value") parser.add_argument("--top_p", type=float, default=0.9, - help="Top-p sampling value") + help="Top-p value") parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for sampling") parser.add_argument("--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("--kvcache", type=int, default=1, + parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1], help="Key-value cache flag (0 or 1)") parser.add_argument("--stop_repetition", type=int, default=3, help="Stop repetition for generation") @@ -54,7 +53,6 @@ def parse_arguments(): help="beam size for MFA alignment") parser.add_argument("--retry_beam_size", type=int, default=40, help="retry beam size for MFA alignment") - parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") parser.add_argument("--original_audio", type=str, @@ -147,9 +145,6 @@ def parse_arguments(): assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}" prompt_end_frame = int(cut_off_sec * info.sample_rate) -# run the model to get the output - - def seed_everything(seed): os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) @@ -162,6 +157,7 @@ def seed_everything(seed): seed_everything(seed) +# inference decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size} concated_audio, gen_audio = inference_one_sample(model, argparse.Namespace( diff --git a/speech_editing_demo.py b/speech_editing_demo.py index 85e6ff19..220d342e 100644 --- 
a/speech_editing_demo.py +++ b/speech_editing_demo.py @@ -1 +1,219 @@ -# WIP +""" +This script will allow you to run Speech Editing inference with Voicecraft +Before getting started, be sure to follow the environment setup. +""" + +from inference_speech_editing_scale import inference_one_sample, get_mask_interval +from edit_utils import get_span +from models import voicecraft +from data.tokenizer import ( + AudioTokenizer, + TextTokenizer, +) +import argparse +import random +import numpy as np +import torchaudio +import torch +import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["USER"] = "me" # TODO change this to your username + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="VoiceCraft Speech Editing: see the script for more information on the options") + + parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], + help="VoiceCraft model to use") + parser.add_argument("--silence_tokens", type=int, nargs="*", + default=[1388, 1898, 131], help="Silence token IDs") + parser.add_argument("--left_margin", type=float, + default=0.08, help="Left margin value.") + parser.add_argument("--right_margin", type=float, + default=0.08, help="Right margin value.") + parser.add_argument("--codec_audio_sr", type=int, + default=16000, help="Codec audio sample rate.") + parser.add_argument("--codec_sr", type=int, default=50, + help="Codec sample rate.") + parser.add_argument("--top_k", type=float, default=0, help="Top k value.") + parser.add_argument("--top_p", type=float, + default=0.8, help="Top p value.") + parser.add_argument("--temperature", type=float, + default=1, help="Temperature value.") + parser.add_argument("--kvcache", type=float, + default=0, help="Kvcache value.") + parser.add_argument("--seed", type=int, default=1, 
help="Seed value.") + parser.add_argument("--beam_size", type=int, default=10, + help="beam size for MFA alignment") + parser.add_argument("--retry_beam_size", type=int, default=40, + help="retry beam size for MFA alignment") + parser.add_argument("--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of audio file") + parser.add_argument("--stop_repetition", type=int, + default=-1, help="Stop repetition for generation") + parser.add_argument("--original_transcript", type=str, + default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", + help="original transcript") + parser.add_argument("--target_transcript", type=str, + default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", + help="target transcript") + parser.add_argument("--edit_type", type=str, + default="substitution", + choices=["insertion", "substitution", "deletion"], + help="type of specified edit") + parser.add_argument("--output_dir", type=str, + default="./demo/generated_se", help="output directory") + args = parser.parse_args() + return args + + +args = parse_arguments() + +voicecraft_name = args.model_name + +# hyperparameters for inference +left_margin = args.left_margin +right_margin = args.right_margin +codec_audio_sr = args.codec_audio_sr +codec_sr = args.codec_sr +top_k = args.top_k +top_p = args.top_p +temperature = args.temperature +kvcache = args.kvcache +# NOTE: adjust the below three arguments if the generation is not as good +seed = args.seed # random seed magic +silence_tokens = args.silence_tokens +# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1 +stop_repetition = args.stop_repetition +# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest + + +def 
seed_everything(seed): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +seed_everything(seed) +device = "cuda" if torch.cuda.is_available() else "cpu" +# or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth +model = voicecraft.VoiceCraft.from_pretrained( + f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") +phn2num = model.args.phn2num +config = vars(model.args) +model.to(device) + +encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" +if not os.path.exists(encodec_fn): + os.system( + f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") + os.system( + f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") +# will also put the neural codec model on gpu +audio_tokenizer = AudioTokenizer(signature=encodec_fn) + +text_tokenizer = TextTokenizer(backend="espeak") + +# point to the original file or record the file +# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file +orig_audio = args.original_audio +orig_transcript = args.original_transcript +# move the audio and transcript to temp folder +temp_folder = "./demo/temp" +os.makedirs(temp_folder, exist_ok=True) +os.system(f"cp {orig_audio} {temp_folder}") +filename = os.path.splitext(orig_audio.split("/")[-1])[0] +with open(f"{temp_folder}/{filename}.txt", "w") as f: + f.write(orig_transcript) +# run MFA to get the alignment +align_temp = f"{temp_folder}/mfa_alignments" +os.makedirs(align_temp, exist_ok=True) +beam_size = args.beam_size +retry_beam_size = args.retry_beam_size + +os.system("source ~/.bashrc && \ + conda activate voicecraft && \ + mfa align -v --clean -j 1 --output_format csv {temp_folder} \ + english_us_arpa english_us_arpa {align_temp} --beam {beam_size} 
--retry_beam {retry_beam_size}" + ) +# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue +# os.system(f"mfa align -j 1 --clean --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000") +audio_fn = f"{temp_folder}/{filename}.wav" +transcript_fn = f"{temp_folder}/{filename}.txt" +align_fn = f"{align_temp}/{filename}.csv" + +# propose what do you want the target modified transcript to be +target_transcript = args.target_transcript +edit_type = args.edit_type + +# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2) +# make sure the two modification do not overlap, if they do, you need to combine them into one modification + +# run the script to turn user input to the format that the model can take +orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type) +if orig_span[0] > orig_span[1]: + RuntimeError(f"example {audio_fn} failed") +if orig_span[0] == orig_span[1]: + orig_span_save = [orig_span[0]] +else: + orig_span_save = orig_span +if new_span[0] == new_span[1]: + new_span_save = [new_span[0]] +else: + new_span_save = new_span + +orig_span_save = ",".join([str(item) for item in orig_span_save]) +new_span_save = ",".join([str(item) for item in new_span_save]) + +start, end = get_mask_interval(align_fn, orig_span_save, edit_type) +info = torchaudio.info(audio_fn) +audio_dur = info.num_frames / info.sample_rate +morphed_span = (max(start - left_margin, 1/codec_sr), + min(end + right_margin, audio_dur)) # in seconds + +# span in codec frames +mask_interval = [[round(morphed_span[0]*codec_sr), + round(morphed_span[1]*codec_sr)]] +mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now + +# run the model to get the output + +decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': 
stop_repetition, + 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens} +orig_audio, new_audio = inference_one_sample(model, argparse.Namespace( + **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config) + +# save segments for comparison +orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu() +# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}") + +# save the audio +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) + +save_fn_new = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav" + +torchaudio.save(save_fn_new, new_audio, codec_audio_sr) + +save_fn_orig = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav" +if not os.path.isfile(save_fn_orig): + orig_audio, orig_sr = torchaudio.load(audio_fn) + if orig_sr != codec_audio_sr: + orig_audio = torchaudio.transforms.Resample( + orig_sr, codec_audio_sr)(orig_audio) + torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr) + +# # if you get error importing T5 in transformers +# # try +# # pip uninstall Pillow +# # pip install Pillow +# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored From 1850da92104c264ea96b84b0cc886c10608127a7 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 18:55:34 -0500 Subject: [PATCH 5/7] add short form commands --- inference_demo.py | 58 ++++++++++++++++++++++-------------------- speech_editing_demo.py | 41 ++++++++++++++--------------- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/inference_demo.py b/inference_demo.py index 7c9f62e5..86cd506c 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -26,44 +26,45 @@ def parse_arguments(): parser = argparse.ArgumentParser( description="VoiceCraft TTS Inference: see the script for more information on the options") - 
parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], help="VoiceCraft model to use") - parser.add_argument("--codec_audio_sr", type=int, - default=16000, help="Audio sampling rate for the codec") - parser.add_argument("--codec_sr", type=int, default=50, - help="Sampling rate for the codec") - parser.add_argument("--top_k", type=float, default=0, - help="Top-k value") - parser.add_argument("--top_p", type=float, default=0.9, - help="Top-p value") - parser.add_argument("--temperature", type=float, - default=1.0, help="Temperature for sampling") - parser.add_argument("--silence_tokens", type=int, nargs="*", + parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1], - help="Key-value cache flag (0 or 1)") - parser.add_argument("--stop_repetition", type=int, - default=3, help="Stop repetition for generation") + parser.add_argument("-casr", "--codec_audio_sr", type=int, + default=16000, help="Codec audio sample rate.") + parser.add_argument("-csr", "--codec_sr", type=int, default=50, + help="Codec sample rate.") + + parser.add_argument("-k", "--top_k", type=float, + default=0, help="Top k value.") + parser.add_argument("-p", "--top_p", type=float, + default=0.8, help="Top p value.") + parser.add_argument("-t", "--temperature", type=float, + default=1, help="Temperature value.") + parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1], + default=0, help="Kvcache value.") + parser.add_argument("-sr", "--stop_repetition", type=int, + default=-1, help="Stop repetition for generation") parser.add_argument("--sample_batch_size", type=int, default=3, help="Batch size for sampling") - parser.add_argument("--seed", type=int, default=1, - 
help="Random seed for reproducibility") - parser.add_argument("--beam_size", type=int, default=10, + parser.add_argument("-s", "--seed", type=int, + default=1, help="Seed value.") + parser.add_argument("-bs", "--beam_size", type=int, default=10, help="beam size for MFA alignment") - parser.add_argument("--retry_beam_size", type=int, default=40, + parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, help="retry beam size for MFA alignment") parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") - parser.add_argument("--original_audio", type=str, - default="./demo/84_121550_000074_000000.wav", help="location of target audio file") - parser.add_argument("--original_transcript", type=str, + parser.add_argument("-oa", "--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of audio file") + parser.add_argument("-ot", "--original_transcript", type=str, default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", - help="original audio transcript") - parser.add_argument("--target_transcript", type=str, - default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!", - help="target audio transcript") - parser.add_argument("--cut_off_sec", type=float, default=3.6, + help="original transcript") + parser.add_argument("-tt", "--target_transcript", type=str, + default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", + help="target transcript") + parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6, help="cut off point in seconds for input prompt") args = parser.parse_args() return args @@ -145,6 +146,7 @@ def parse_arguments(): assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the 
audio duration {audio_dur}" prompt_end_frame = int(cut_off_sec * info.sample_rate) + def seed_everything(seed): os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) diff --git a/speech_editing_demo.py b/speech_editing_demo.py index 220d342e..99c24f52 100644 --- a/speech_editing_demo.py +++ b/speech_editing_demo.py @@ -27,46 +27,47 @@ def parse_arguments(): parser = argparse.ArgumentParser( description="VoiceCraft Speech Editing: see the script for more information on the options") - parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], help="VoiceCraft model to use") - parser.add_argument("--silence_tokens", type=int, nargs="*", + parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("--left_margin", type=float, + parser.add_argument("-lm", "--left_margin", type=float, default=0.08, help="Left margin value.") - parser.add_argument("--right_margin", type=float, + parser.add_argument("-rm", "--right_margin", type=float, default=0.08, help="Right margin value.") - parser.add_argument("--codec_audio_sr", type=int, + parser.add_argument("-casr", "--codec_audio_sr", type=int, default=16000, help="Codec audio sample rate.") - parser.add_argument("--codec_sr", type=int, default=50, + parser.add_argument("-csr", "--codec_sr", type=int, default=50, help="Codec sample rate.") - parser.add_argument("--top_k", type=float, default=0, help="Top k value.") - parser.add_argument("--top_p", type=float, + parser.add_argument("-k", "--top_k", type=float, + default=0, help="Top k value.") + parser.add_argument("-p", "--top_p", type=float, default=0.8, help="Top p value.") - parser.add_argument("--temperature", type=float, + parser.add_argument("-t", "--temperature", type=float, default=1, 
help="Temperature value.") - parser.add_argument("--kvcache", type=float, + parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1], default=0, help="Kvcache value.") - parser.add_argument("--seed", type=int, default=1, help="Seed value.") - parser.add_argument("--beam_size", type=int, default=10, + parser.add_argument("-sr", "--stop_repetition", type=int, + default=-1, help="Stop repetition for generation") + parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.") + parser.add_argument("-bs", "--beam_size", type=int, default=10, help="beam size for MFA alignment") - parser.add_argument("--retry_beam_size", type=int, default=40, + parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, help="retry beam size for MFA alignment") - parser.add_argument("--original_audio", type=str, + parser.add_argument("-oa", "--original_audio", type=str, default="./demo/84_121550_000074_000000.wav", help="location of audio file") - parser.add_argument("--stop_repetition", type=int, - default=-1, help="Stop repetition for generation") - parser.add_argument("--original_transcript", type=str, + parser.add_argument("-ot", "--original_transcript", type=str, default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", help="original transcript") - parser.add_argument("--target_transcript", type=str, + parser.add_argument("-tt", "--target_transcript", type=str, default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", help="target transcript") - parser.add_argument("--edit_type", type=str, + parser.add_argument("-et", "--edit_type", type=str, default="substitution", choices=["insertion", "substitution", "deletion"], help="type of specified edit") - parser.add_argument("--output_dir", type=str, + parser.add_argument("-o", "--output_dir", type=str, default="./demo/generated_se", help="output 
directory") args = parser.parse_args() return args From 9fb6d948d0d86a624b815288afda92bac5fe839a Mon Sep 17 00:00:00 2001 From: pgosar Date: Tue, 23 Apr 2024 19:07:24 -0500 Subject: [PATCH 6/7] add simple running instructions --- README.md | 19 ++++++++++++++----- inference_demo.py => tts_demo.py | 0 2 files changed, 14 insertions(+), 5 deletions(-) rename inference_demo.py => tts_demo.py (100%) diff --git a/README.md b/README.md index d7c6d867..129c0bff 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ There are three ways (besides running Gradio in Colab): 1. More flexible inference beyond Gradio UI in Google Colab. see [quickstart colab](#quickstart-colab) 2. with docker. see [quickstart docker](#quickstart-docker) 3. without docker. see [environment setup](#environment-setup). You can also run gradio locally if you choose this option +4. As a standalone script that you can easily integrate into other projects. +see [quickstart command line](#quickstart-command-line). When you are inside the docker image or you have installed all dependencies, Checkout [`inference_tts.ipynb`](./inference_tts.ipynb). @@ -21,7 +23,7 @@ If you want to do model development such as training/finetuning, I recommend fol ## News :star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)! -:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). +:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! 
Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgosar](https://github.com/pgosar), [@Ph0rk0z](https://github.com/Ph0rk0z). :star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data). Even stronger models forthcomming, stay tuned! @@ -37,11 +39,9 @@ If you want to do model development such as training/finetuning, I recommend fol - [x] Better guidance on training/finetuning - [x] Colab notebooks - [x] HuggingFace Spaces demo -- [ ] Command line +- [x] Command line - [ ] Improve efficiency - - ## QuickStart Colab :star: To try out speech editing or TTS Inference with VoiceCraft, the simplest way is using Google Colab. @@ -50,6 +50,15 @@ Instructions to run are on the Colab itself. 1. To try [Speech Editing](https://colab.research.google.com/drive/1FV7EC36dl8UioePY1xXijXTMl7X47kR_?usp=sharing) 2. To try [TTS Inference](https://colab.research.google.com/drive/1lch_6it5-JpXgAQlUTRRI2z2_rk5K67Z?usp=sharing) +## QuickStart Command Line + +:star: To use it as a standalone script, check out tts_demo.py and speech_editing_demo.py. +Be sure to first [set up your environment](#environment-setup). +Without arguments, they will run the standard demo arguments used as an example elsewhere +in this repository. You can use the command line arguments to specify unique input audios, +target transcripts, and inference hyperparameters. Run the help command for more information: +`python3 tts_demo.py -h` and `python3 speech_editing_demo.py -h` + ## QuickStart Docker :star: To try out TTS inference with VoiceCraft, you can also use docker. Thank [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
@@ -197,7 +206,7 @@ cd ./z_scripts bash e830M.sh ``` -It's the same procedure to prepare your own custom dataset. Make sure that if +It's the same procedure to prepare your own custom dataset. Make sure that if ## Finetuning You also need to do step 1-4 as Training, and I recommend to use AdamW for optimization if you finetune a pretrained model for better stability. checkout script `./z_scripts/e830M_ft.sh`. diff --git a/inference_demo.py b/tts_demo.py similarity index 100% rename from inference_demo.py rename to tts_demo.py From 1a896d21fe2866cc99707140c2b533d0eef2c7ce Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Fri, 3 May 2024 22:16:06 -0500 Subject: [PATCH 7/7] adjust cut off sec and target transcript --- tts_demo.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/tts_demo.py b/tts_demo.py index 86cd506c..6f8c3478 100644 --- a/tts_demo.py +++ b/tts_demo.py @@ -62,10 +62,13 @@ def parse_arguments(): default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", help="original transcript") parser.add_argument("-tt", "--target_transcript", type=str, - default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", + default="object was seen as a mirage in the lake in the distance,", help="target transcript") parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6, help="cut off point in seconds for input prompt") + parser.add_argument("-ma", "--margin", type=float, default=0.07, + help="lowest margin in seconds between words for input prompt") + args = parser.parse_args() return args @@ -135,11 +138,33 @@ def parse_arguments(): # if the above fails, it could be because the audio is too hard for the alignment model, # increasing the beam size usually solves the issue +def find_closest_word_boundary(alignments, cut_off_sec, margin): + with open(alignments, 
'r') as file: + # skip header + next(file) + prev_end = 0.0 + cutoff_time = None + cutoff_index = None + for i, line in enumerate(file): + end = float(line.strip().split(',')[1]) + if end >= cut_off_sec and end - prev_end >= margin: + cutoff_time = end + margin / 2 + cutoff_index = i + break + + prev_end = end + + return cutoff_time, cutoff_index + # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt -cut_off_sec = args.cut_off_sec # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio -target_transcript = args.target_transcript -# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. +# NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This should be different for different audio +cut_off_sec = args.cut_off_sec +margin = args.margin audio_fn = f"{temp_folder}/{filename}.wav" +alignments = f"{temp_folder}/mfa_alignments/{filename}.csv" +cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin) +target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript +# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. info = torchaudio.info(audio_fn) audio_dur = info.num_frames / info.sample_rate