From 1e0eaeba2becbf7a632f29971c96b1ac380a2af7 Mon Sep 17 00:00:00 2001
From: pgosar <pgosarsteam@gmail.com>
Date: Wed, 17 Apr 2024 16:32:50 -0500
Subject: [PATCH 1/7] add files

---
 inference_demo.py      | 1 +
 speech_editing_demo.py | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 inference_demo.py
 create mode 100644 speech_editing_demo.py

diff --git a/inference_demo.py b/inference_demo.py
new file mode 100644
index 00000000..85e6ff19
--- /dev/null
+++ b/inference_demo.py
@@ -0,0 +1 @@
+# WIP
diff --git a/speech_editing_demo.py b/speech_editing_demo.py
new file mode 100644
index 00000000..85e6ff19
--- /dev/null
+++ b/speech_editing_demo.py
@@ -0,0 +1 @@
+# WIP

From 63736f72696f1492beb7e0406c4e7a736a9f2eb1 Mon Sep 17 00:00:00 2001
From: Pranay Gosar <pgosarsteam@gmail.com>
Date: Tue, 23 Apr 2024 13:01:44 -0500
Subject: [PATCH 2/7] add TTS

---
 inference_demo.py | 185 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 184 insertions(+), 1 deletion(-)

diff --git a/inference_demo.py b/inference_demo.py
index 85e6ff19..bf142941 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -1 +1,184 @@
-# WIP
+"""
+This script will allow you to run TTS inference with Voicecraft
+Before getting started, be sure to follow the environment setup.
+"""
+
+from inference_tts_scale import inference_one_sample
+from models import voicecraft
+from data.tokenizer import (
+    AudioTokenizer,
+    TextTokenizer,
+)
+from IPython.display import display, Audio
+import argparse
+import random
+import numpy as np
+import torchaudio
+import torch
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["USER"] = "me"  # TODO change this to your username
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="VoiceCraft Inference: see the script for more information on the options")
+
+    parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
+                        "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
+                        help="VoiceCraft model to use")
+    parser.add_argument("--codec_audio_sr", type=int,
+                        default=16000, help="Audio sampling rate for the codec")
+    parser.add_argument("--codec_sr", type=int, default=50,
+                        help="Sampling rate for the codec")
+    parser.add_argument("--top_k", type=int, default=0,
+                        help="Top-k sampling value")
+    parser.add_argument("--top_p", type=float, default=0.9,
+                        help="Top-p sampling value")
+    parser.add_argument("--temperature", type=float,
+                        default=1.0, help="Temperature for sampling")
+    parser.add_argument("--silence_tokens", type=int, nargs="*",
+                        default=[1388, 1898, 131], help="Silence token IDs")
+    parser.add_argument("--kvcache", type=int, default=1,
+                        help="Key-value cache flag (0 or 1)")
+    parser.add_argument("--stop_repetition", type=int,
+                        default=3, help="Stop repetition for generation")
+    parser.add_argument("--sample_batch_size", type=int,
+                        default=3, help="Batch size for sampling")
+    parser.add_argument("--seed", type=int, default=1,
+                        help="Random seed for reproducibility")
+    parser.add_argument("--output_dir", type=str, default="./generated_tts",
+                        help="directory to save generated audio")
+    parser.add_argument("--original_audio", type=str,
+                        default="./demo/84_121550_000074_000000.wav", help="location of target audio file")
+    parser.add_argument("--original_transcript", type=str,
+                        default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
+                        help="original audio transcript")
+    parser.add_argument("--target_transcript", type=str,
+                        default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!",
+                        help="target audio transcript")
+    parser.add_argument("--cut_off_sec", type=float, default=3.6,
+                        help="cut off point in seconds for input prompt")
+    args = parser.parse_args()
+    return args
+
+
+args = parse_arguments()
+
+voicecraft_name = args.model_name
+# hyperparameters for inference
+codec_audio_sr = args.codec_audio_sr
+codec_sr = args.codec_sr
+top_k = args.top_k
+top_p = args.top_p  # defaults to 0.9 can also try 0.8, but 0.9 seems to work better
+temperature = args.temperature
+silence_tokens = args.silence_tokens
+kvcache = args.kvcache  # NOTE if OOM, change this to 0, or try the 330M model
+
+# NOTE adjust the below three arguments if the generation is not as good
+# NOTE if the model generates long silences, reduce stop_repetition to 3, 2 or even 1
+stop_repetition = args.stop_repetition
+
+# NOTE: if there are long silences or unnaturally stretched words,
+# increase sample_batch_size to 4 or higher. The model will then generate
+# sample_batch_size examples of the same audio and pick the shortest one.
+# So if the speech rate of the generated audio is too fast, change it to a smaller number.
+sample_batch_size = args.sample_batch_size
+seed = args.seed  # change seed if you are still unhappy with the result
+
+# load the model
+model = voicecraft.VoiceCraft.from_pretrained(
+    f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
+phn2num = model.args.phn2num
+config = vars(model.args)
+model.to(device)
+
+encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+if not os.path.exists(encodec_fn):
+    os.system(
+        f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
+    os.system(
+        f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+# will also put the neural codec model on gpu
+audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device)
+
+text_tokenizer = TextTokenizer(backend="espeak")
+
+# Prepare your audio
+# point to the original audio whose speech you want to clone
+# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
+orig_audio = args.original_audio
+orig_transcript = args.original_transcript
+
+# move the audio and transcript to temp folder
+temp_folder = "./demo/temp"
+os.makedirs(temp_folder, exist_ok=True)
+os.system(f"cp {orig_audio} {temp_folder}")
+filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+with open(f"{temp_folder}/{filename}.txt", "w") as f:
+    f.write(orig_transcript)
+# run MFA to get the alignment
+align_temp = f"{temp_folder}/mfa_alignments"
+
+os.system("source ~/.bashrc && \
+    conda activate voicecraft && \
+    mfa align -v --clean -j 1 --output_format csv {temp_folder} \
+        english_us_arpa english_us_arpa {align_temp}"
+          )
+
+# # if the above fails, it could be because the audio is too hard for the alignment model,
+# increasing the beam size usually solves the issue
+# os.system("source ~/.bashrc && \
+#     conda activate voicecraft && \
+#     mfa align -v --clean -j 1 --output_format csv {temp_folder} \
+#         english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
+
+# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
+cut_off_sec = args.cut_off_sec  # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio
+target_transcript = args.target_transcript
+# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
+audio_fn = f"{temp_folder}/{filename}.wav"
+info = torchaudio.info(audio_fn)
+audio_dur = info.num_frames / info.sample_rate
+
+assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
+prompt_end_frame = int(cut_off_sec * info.sample_rate)
+
+# run the model to get the output
+
+
+def seed_everything(seed):
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
+
+seed_everything(seed)
+
+decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache,
+                 "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
+concated_audio, gen_audio = inference_one_sample(model, argparse.Namespace(
+    **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)
+
+# save segments for comparison
+concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
+# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
+
+# save the audio
+# output_dir
+output_dir = args.output_dir
+os.makedirs(output_dir, exist_ok=True)
+seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
+seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
+
+torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)
+torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)
+
+# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored

From b8bb2ab592f6146d882f16cd7aab0869aea45889 Mon Sep 17 00:00:00 2001
From: Pranay Gosar <pgosarsteam@gmail.com>
Date: Tue, 23 Apr 2024 15:25:43 -0500
Subject: [PATCH 3/7] add beam size cmd args

---
 inference_demo.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/inference_demo.py b/inference_demo.py
index bf142941..7e438ae9 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -50,6 +50,11 @@ def parse_arguments():
                         default=3, help="Batch size for sampling")
     parser.add_argument("--seed", type=int, default=1,
                         help="Random seed for reproducibility")
+    parser.add_argument("--beam_size", type=int, default=10,
+                        help="beam size for MFA alignment")
+    parser.add_argument("--retry_beam_size", type=int, default=40,
+                        help="retry beam size for MFA alignment")
+
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
                         help="directory to save generated audio")
     parser.add_argument("--original_audio", type=str,
@@ -67,7 +72,6 @@ def parse_arguments():
 
 
 args = parse_arguments()
-
 voicecraft_name = args.model_name
 # hyperparameters for inference
 codec_audio_sr = args.codec_audio_sr
@@ -122,19 +126,15 @@ def parse_arguments():
     f.write(orig_transcript)
 # run MFA to get the alignment
 align_temp = f"{temp_folder}/mfa_alignments"
-
+beam_size = args.beam_size
+retry_beam_size = args.retry_beam_size
-os.system("source ~/.bashrc && \
+os.system(f"source ~/.bashrc && \
     conda activate voicecraft && \
     mfa align -v --clean -j 1 --output_format csv {temp_folder} \
-        english_us_arpa english_us_arpa {align_temp}"
+        english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}"
           )
-
-# # if the above fails, it could be because the audio is too hard for the alignment model,
+# if the above fails, it could be because the audio is too hard for the alignment model,
 # increasing the beam size usually solves the issue
-# os.system("source ~/.bashrc && \
-#     conda activate voicecraft && \
-#     mfa align -v --clean -j 1 --output_format csv {temp_folder} \
-#         english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
 
 # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
 cut_off_sec = args.cut_off_sec  # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio

From 59877c085e49b61cb441bb0624b3fbd842da2d43 Mon Sep 17 00:00:00 2001
From: Pranay Gosar <pgosarsteam@gmail.com>
Date: Tue, 23 Apr 2024 18:38:09 -0500
Subject: [PATCH 4/7] add speech editing

---
 inference_demo.py      |  16 ++-
 speech_editing_demo.py | 220 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 225 insertions(+), 11 deletions(-)

diff --git a/inference_demo.py b/inference_demo.py
index 7e438ae9..7c9f62e5 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -9,7 +9,6 @@
     AudioTokenizer,
     TextTokenizer,
 )
-from IPython.display import display, Audio
 import argparse
 import random
 import numpy as np
@@ -25,7 +24,7 @@
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description="VoiceCraft Inference: see the script for more information on the options")
+        description="VoiceCraft TTS Inference: see the script for more information on the options")
 
     parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
                         "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
@@ -34,15 +33,15 @@ def parse_arguments():
                         default=16000, help="Audio sampling rate for the codec")
     parser.add_argument("--codec_sr", type=int, default=50,
                         help="Sampling rate for the codec")
-    parser.add_argument("--top_k", type=int, default=0,
-                        help="Top-k sampling value")
+    parser.add_argument("--top_k", type=float, default=0,
+                        help="Top-k value")
     parser.add_argument("--top_p", type=float, default=0.9,
-                        help="Top-p sampling value")
+                        help="Top-p value")
     parser.add_argument("--temperature", type=float,
                         default=1.0, help="Temperature for sampling")
     parser.add_argument("--silence_tokens", type=int, nargs="*",
                         default=[1388, 1898, 131], help="Silence token IDs")
-    parser.add_argument("--kvcache", type=int, default=1,
+    parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1],
                         help="Key-value cache flag (0 or 1)")
     parser.add_argument("--stop_repetition", type=int,
                         default=3, help="Stop repetition for generation")
@@ -54,7 +53,6 @@ def parse_arguments():
                         help="beam size for MFA alignment")
     parser.add_argument("--retry_beam_size", type=int, default=40,
                         help="retry beam size for MFA alignment")
-
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
                         help="directory to save generated audio")
     parser.add_argument("--original_audio", type=str,
@@ -147,9 +145,6 @@ def parse_arguments():
 assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
 prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
-# run the model to get the output
-
-
 def seed_everything(seed):
     os.environ['PYTHONHASHSEED'] = str(seed)
     random.seed(seed)
@@ -162,6 +157,7 @@ def seed_everything(seed):
 
 seed_everything(seed)
 
+# inference
 decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache,
                  "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
 concated_audio, gen_audio = inference_one_sample(model, argparse.Namespace(
diff --git a/speech_editing_demo.py b/speech_editing_demo.py
index 85e6ff19..220d342e 100644
--- a/speech_editing_demo.py
+++ b/speech_editing_demo.py
@@ -1 +1,219 @@
-# WIP
+"""
+This script will allow you to run Speech Editing inference with Voicecraft
+Before getting started, be sure to follow the environment setup.
+"""
+
+from inference_speech_editing_scale import inference_one_sample, get_mask_interval
+from edit_utils import get_span
+from models import voicecraft
+from data.tokenizer import (
+    AudioTokenizer,
+    TextTokenizer,
+)
+import argparse
+import random
+import numpy as np
+import torchaudio
+import torch
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["USER"] = "me"  # TODO change this to your username
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="VoiceCraft Speech Editing: see the script for more information on the options")
+
+    parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
+                        "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
+                        help="VoiceCraft model to use")
+    parser.add_argument("--silence_tokens", type=int, nargs="*",
+                        default=[1388, 1898, 131], help="Silence token IDs")
+    parser.add_argument("--left_margin", type=float,
+                        default=0.08, help="Left margin value.")
+    parser.add_argument("--right_margin", type=float,
+                        default=0.08, help="Right margin value.")
+    parser.add_argument("--codec_audio_sr", type=int,
+                        default=16000, help="Codec audio sample rate.")
+    parser.add_argument("--codec_sr", type=int, default=50,
+                        help="Codec sample rate.")
+    parser.add_argument("--top_k", type=float, default=0, help="Top k value.")
+    parser.add_argument("--top_p", type=float,
+                        default=0.8, help="Top p value.")
+    parser.add_argument("--temperature", type=float,
+                        default=1, help="Temperature value.")
+    parser.add_argument("--kvcache", type=float,
+                        default=0, help="Kvcache value.")
+    parser.add_argument("--seed", type=int, default=1, help="Seed value.")
+    parser.add_argument("--beam_size", type=int, default=10,
+                        help="beam size for MFA alignment")
+    parser.add_argument("--retry_beam_size", type=int, default=40,
+                        help="retry beam size for MFA alignment")
+    parser.add_argument("--original_audio", type=str,
+                        default="./demo/84_121550_000074_000000.wav", help="location of audio file")
+    parser.add_argument("--stop_repetition", type=int,
+                        default=-1, help="Stop repetition for generation")
+    parser.add_argument("--original_transcript", type=str,
+                        default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
+                        help="original transcript")
+    parser.add_argument("--target_transcript", type=str,
+                        default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
+                        help="target transcript")
+    parser.add_argument("--edit_type", type=str,
+                        default="substitution",
+                        choices=["insertion", "substitution", "deletion"],
+                        help="type of specified edit")
+    parser.add_argument("--output_dir", type=str,
+                        default="./demo/generated_se", help="output directory")
+    args = parser.parse_args()
+    return args
+
+
+args = parse_arguments()
+
+voicecraft_name = args.model_name
+
+# hyperparameters for inference
+left_margin = args.left_margin
+right_margin = args.right_margin
+codec_audio_sr = args.codec_audio_sr
+codec_sr = args.codec_sr
+top_k = args.top_k
+top_p = args.top_p
+temperature = args.temperature
+kvcache = args.kvcache
+# NOTE: adjust the below three arguments if the generation is not as good
+seed = args.seed  # random seed magic
+silence_tokens = args.silence_tokens
+# if there are long silences in the generated audio, reduce stop_repetition to 3, 2 or even 1
+stop_repetition = args.stop_repetition
+# (the TTS demo additionally runs sample_batch_size examples of the same audio and picks the shortest one; this script has no such option)
+
+
+def seed_everything(seed):
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
+
+seed_everything(seed)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth
+model = voicecraft.VoiceCraft.from_pretrained(
+    f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
+phn2num = model.args.phn2num
+config = vars(model.args)
+model.to(device)
+
+encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+if not os.path.exists(encodec_fn):
+    os.system(
+        f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
+    os.system(
+        f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+# will also put the neural codec model on gpu
+audio_tokenizer = AudioTokenizer(signature=encodec_fn)
+
+text_tokenizer = TextTokenizer(backend="espeak")
+
+# point to the original file or record the file
+# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
+orig_audio = args.original_audio
+orig_transcript = args.original_transcript
+# move the audio and transcript to temp folder
+temp_folder = "./demo/temp"
+os.makedirs(temp_folder, exist_ok=True)
+os.system(f"cp {orig_audio} {temp_folder}")
+filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+with open(f"{temp_folder}/{filename}.txt", "w") as f:
+    f.write(orig_transcript)
+# run MFA to get the alignment
+align_temp = f"{temp_folder}/mfa_alignments"
+os.makedirs(align_temp, exist_ok=True)
+beam_size = args.beam_size
+retry_beam_size = args.retry_beam_size
+
+os.system(f"source ~/.bashrc && \
+    conda activate voicecraft && \
+    mfa align -v --clean -j 1 --output_format csv {temp_folder} \
+        english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}"
+          )  # must be an f-string: without the prefix the {placeholders} reach the shell verbatim
+# if it fails, it could be because the audio is too hard for the alignment model; increasing the beam size (--beam_size / --retry_beam_size) usually solves the issue
+# os.system(f"mfa align -j 1 --clean --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
+audio_fn = f"{temp_folder}/{filename}.wav"
+transcript_fn = f"{temp_folder}/{filename}.txt"
+align_fn = f"{align_temp}/{filename}.csv"
+
+# specify what you want the target (modified) transcript to be
+target_transcript = args.target_transcript
+edit_type = args.edit_type
+
+# if you want to make a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)
+# make sure the two modifications do not overlap; if they do, you need to combine them into one modification
+
+# run the script to turn user input to the format that the model can take
+orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)
+if orig_span[0] > orig_span[1]:
+    raise RuntimeError(f"example {audio_fn} failed")  # inverted span: abort instead of silently continuing
+if orig_span[0] == orig_span[1]:
+    orig_span_save = [orig_span[0]]  # start == end is saved as a single index
+else:
+    orig_span_save = orig_span
+if new_span[0] == new_span[1]:
+    new_span_save = [new_span[0]]  # same single-index form for the new span
+else:
+    new_span_save = new_span
+
+orig_span_save = ",".join([str(item) for item in orig_span_save])
+new_span_save = ",".join([str(item) for item in new_span_save])
+
+start, end = get_mask_interval(align_fn, orig_span_save, edit_type)
+info = torchaudio.info(audio_fn)
+audio_dur = info.num_frames / info.sample_rate
+morphed_span = (max(start - left_margin, 1/codec_sr),
+                min(end + right_margin, audio_dur))  # in seconds
+
+# span in codec frames
+mask_interval = [[round(morphed_span[0]*codec_sr),
+                  round(morphed_span[1]*codec_sr)]]
+mask_interval = torch.LongTensor(mask_interval)  # [M,2], M==1 for now
+
+# run the model to get the output
+
+decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
+                 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens}
+orig_audio, new_audio = inference_one_sample(model, argparse.Namespace(
+    **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)
+
+# save segments for comparison
+orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()
+# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
+
+# save the audio
+output_dir = args.output_dir
+os.makedirs(output_dir, exist_ok=True)
+
+save_fn_new = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav"
+
+torchaudio.save(save_fn_new, new_audio, codec_audio_sr)
+
+save_fn_orig = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav"
+if not os.path.isfile(save_fn_orig):
+    orig_audio, orig_sr = torchaudio.load(audio_fn)
+    if orig_sr != codec_audio_sr:
+        orig_audio = torchaudio.transforms.Resample(
+            orig_sr, codec_audio_sr)(orig_audio)
+    torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)
+
+# # if you get error importing T5 in transformers
+# # try
+# # pip uninstall Pillow
+# # pip install Pillow
+# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored

From 1850da92104c264ea96b84b0cc886c10608127a7 Mon Sep 17 00:00:00 2001
From: Pranay Gosar <pgosarsteam@gmail.com>
Date: Tue, 23 Apr 2024 18:55:34 -0500
Subject: [PATCH 5/7] add short form commands

---
 inference_demo.py      | 58 ++++++++++++++++++++++--------------------
 speech_editing_demo.py | 41 ++++++++++++++---------------
 2 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/inference_demo.py b/inference_demo.py
index 7c9f62e5..86cd506c 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -26,44 +26,45 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="VoiceCraft TTS Inference: see the script for more information on the options")
 
-    parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
+    parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[
                         "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
                         help="VoiceCraft model to use")
-    parser.add_argument("--codec_audio_sr", type=int,
-                        default=16000, help="Audio sampling rate for the codec")
-    parser.add_argument("--codec_sr", type=int, default=50,
-                        help="Sampling rate for the codec")
-    parser.add_argument("--top_k", type=float, default=0,
-                        help="Top-k value")
-    parser.add_argument("--top_p", type=float, default=0.9,
-                        help="Top-p value")
-    parser.add_argument("--temperature", type=float,
-                        default=1.0, help="Temperature for sampling")
-    parser.add_argument("--silence_tokens", type=int, nargs="*",
+    parser.add_argument("-st", "--silence_tokens", type=int, nargs="*",
                         default=[1388, 1898, 131], help="Silence token IDs")
-    parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1],
-                        help="Key-value cache flag (0 or 1)")
-    parser.add_argument("--stop_repetition", type=int,
-                        default=3, help="Stop repetition for generation")
+    parser.add_argument("-casr", "--codec_audio_sr", type=int,
+                        default=16000, help="Codec audio sample rate.")
+    parser.add_argument("-csr", "--codec_sr", type=int, default=50,
+                        help="Codec sample rate.")
+
+    parser.add_argument("-k", "--top_k", type=float,
+                        default=0, help="Top k value.")
+    parser.add_argument("-p", "--top_p", type=float,
+                        default=0.8, help="Top p value.")
+    parser.add_argument("-t", "--temperature", type=float,
+                        default=1, help="Temperature value.")
+    parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1],
+                        default=0, help="Kvcache value.")
+    parser.add_argument("-sr", "--stop_repetition", type=int,
+                        default=-1, help="Stop repetition for generation")
     parser.add_argument("--sample_batch_size", type=int,
                         default=3, help="Batch size for sampling")
-    parser.add_argument("--seed", type=int, default=1,
-                        help="Random seed for reproducibility")
-    parser.add_argument("--beam_size", type=int, default=10,
+    parser.add_argument("-s", "--seed", type=int,
+                        default=1, help="Seed value.")
+    parser.add_argument("-bs", "--beam_size", type=int, default=10,
                         help="beam size for MFA alignment")
-    parser.add_argument("--retry_beam_size", type=int, default=40,
+    parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40,
                         help="retry beam size for MFA alignment")
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
                         help="directory to save generated audio")
-    parser.add_argument("--original_audio", type=str,
-                        default="./demo/84_121550_000074_000000.wav", help="location of target audio file")
-    parser.add_argument("--original_transcript", type=str,
+    parser.add_argument("-oa", "--original_audio", type=str,
+                        default="./demo/84_121550_000074_000000.wav", help="location of audio file")
+    parser.add_argument("-ot", "--original_transcript", type=str,
                         default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
-                        help="original audio transcript")
-    parser.add_argument("--target_transcript", type=str,
-                        default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!",
-                        help="target audio transcript")
-    parser.add_argument("--cut_off_sec", type=float, default=3.6,
+                        help="original transcript")
+    parser.add_argument("-tt", "--target_transcript", type=str,
+                        default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
+                        help="target transcript")
+    parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6,
                         help="cut off point in seconds for input prompt")
     args = parser.parse_args()
     return args
@@ -145,6 +146,7 @@ def parse_arguments():
 assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
 prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
+
 def seed_everything(seed):
     os.environ['PYTHONHASHSEED'] = str(seed)
     random.seed(seed)
diff --git a/speech_editing_demo.py b/speech_editing_demo.py
index 220d342e..99c24f52 100644
--- a/speech_editing_demo.py
+++ b/speech_editing_demo.py
@@ -27,46 +27,47 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="VoiceCraft Speech Editing: see the script for more information on the options")
 
-    parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
+    parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[
                         "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
                         help="VoiceCraft model to use")
-    parser.add_argument("--silence_tokens", type=int, nargs="*",
+    parser.add_argument("-st", "--silence_tokens", type=int, nargs="*",
                         default=[1388, 1898, 131], help="Silence token IDs")
-    parser.add_argument("--left_margin", type=float,
+    parser.add_argument("-lm", "--left_margin", type=float,
                         default=0.08, help="Left margin value.")
-    parser.add_argument("--right_margin", type=float,
+    parser.add_argument("-rm", "--right_margin", type=float,
                         default=0.08, help="Right margin value.")
-    parser.add_argument("--codec_audio_sr", type=int,
+    parser.add_argument("-casr", "--codec_audio_sr", type=int,
                         default=16000, help="Codec audio sample rate.")
-    parser.add_argument("--codec_sr", type=int, default=50,
+    parser.add_argument("-csr", "--codec_sr", type=int, default=50,
                         help="Codec sample rate.")
-    parser.add_argument("--top_k", type=float, default=0, help="Top k value.")
-    parser.add_argument("--top_p", type=float,
+    parser.add_argument("-k", "--top_k", type=float, 
+                        default=0, help="Top k value.")
+    parser.add_argument("-p", "--top_p", type=float,
                         default=0.8, help="Top p value.")
-    parser.add_argument("--temperature", type=float,
+    parser.add_argument("-t", "--temperature", type=float,
                         default=1, help="Temperature value.")
-    parser.add_argument("--kvcache", type=float,
+    parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1],
                         default=0, help="Kvcache value.")
-    parser.add_argument("--seed", type=int, default=1, help="Seed value.")
-    parser.add_argument("--beam_size", type=int, default=10,
+    parser.add_argument("-sr", "--stop_repetition", type=int,
+                        default=-1, help="Stop repetition for generation")
+    parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.")
+    parser.add_argument("-bs", "--beam_size", type=int, default=10,
                         help="beam size for MFA alignment")
-    parser.add_argument("--retry_beam_size", type=int, default=40,
+    parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40,
                         help="retry beam size for MFA alignment")
-    parser.add_argument("--original_audio", type=str,
+    parser.add_argument("-oa", "--original_audio", type=str,
                         default="./demo/84_121550_000074_000000.wav", help="location of audio file")
-    parser.add_argument("--stop_repetition", type=int,
-                        default=-1, help="Stop repetition for generation")
-    parser.add_argument("--original_transcript", type=str,
+    parser.add_argument("-ot", "--original_transcript", type=str,
                         default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
                         help="original transcript")
-    parser.add_argument("--target_transcript", type=str,
+    parser.add_argument("-tt", "--target_transcript", type=str,
                         default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
                         help="target transcript")
-    parser.add_argument("--edit_type", type=str,
+    parser.add_argument("-et", "--edit_type", type=str,
                         default="substitution",
                         choices=["insertion", "substitution", "deletion"],
                         help="type of specified edit")
-    parser.add_argument("--output_dir", type=str,
+    parser.add_argument("-o", "--output_dir", type=str,
                         default="./demo/generated_se", help="output directory")
     args = parser.parse_args()
     return args

From 9fb6d948d0d86a624b815288afda92bac5fe839a Mon Sep 17 00:00:00 2001
From: pgosar <pgosarsteam@gmail.com>
Date: Tue, 23 Apr 2024 19:07:24 -0500
Subject: [PATCH 6/7] add simple running instructions

---
 README.md                        | 19 ++++++++++++++-----
 inference_demo.py => tts_demo.py |  0
 2 files changed, 14 insertions(+), 5 deletions(-)
 rename inference_demo.py => tts_demo.py (100%)

diff --git a/README.md b/README.md
index d7c6d867..129c0bff 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@ There are three ways (besides running Gradio in Colab):
 1. More flexible inference beyond Gradio UI in Google Colab. see [quickstart colab](#quickstart-colab)
 2. with docker. see [quickstart docker](#quickstart-docker)
 3. without docker. see [environment setup](#environment-setup). You can also run gradio locally if you choose this option
+4. As a standalone script that you can easily integrate into other projects.
+see [quickstart command line](#quickstart-command-line).
 
 When you are inside the docker image or you have installed all dependencies, Checkout [`inference_tts.ipynb`](./inference_tts.ipynb).
 
@@ -21,7 +23,7 @@ If you want to do model development such as training/finetuning, I recommend fol
 ## News
 :star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)!
 
-:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). 
+:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgosar](https://github.com/pgosar), [@Ph0rk0z](https://github.com/Ph0rk0z).
 
 :star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data). Even stronger models forthcomming, stay tuned!
 
@@ -37,11 +39,9 @@ If you want to do model development such as training/finetuning, I recommend fol
 - [x] Better guidance on training/finetuning
 - [x] Colab notebooks
 - [x] HuggingFace Spaces demo
-- [ ] Command line
+- [x] Command line
 - [ ] Improve efficiency
 
-
-
 ## QuickStart Colab
 
 :star: To try out speech editing or TTS Inference with VoiceCraft, the simplest way is using Google Colab.
@@ -50,6 +50,15 @@ Instructions to run are on the Colab itself.
 1. To try [Speech Editing](https://colab.research.google.com/drive/1FV7EC36dl8UioePY1xXijXTMl7X47kR_?usp=sharing)
 2. To try [TTS Inference](https://colab.research.google.com/drive/1lch_6it5-JpXgAQlUTRRI2z2_rk5K67Z?usp=sharing)
 
+## QuickStart Command Line
+
+:star: To use it as a standalone script, check out tts_demo.py and speech_editing_demo.py.
+Be sure to first [set up your environment](#environment-setup).
+Without arguments, they will run the standard demo arguments used as an example elsewhere
+in this repository. You can use the command line arguments to specify unique input audios,
+target transcripts, and inference hyperparameters. Run the help command for more information:
+`python3 tts_demo.py -h` and `python3 speech_editing_demo.py -h`
+
 ## QuickStart Docker
 :star: To try out TTS inference with VoiceCraft, you can also use docker. Thank [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
 
@@ -197,7 +206,7 @@ cd ./z_scripts
 bash e830M.sh
 ```
 
-It's the same procedure to prepare your own custom dataset. Make sure that if 
+It's the same procedure to prepare your own custom dataset. Make sure that if
 
 ## Finetuning
 You also need to do step 1-4 as Training, and I recommend to use AdamW for optimization if you finetune a pretrained model for better stability. checkout script `./z_scripts/e830M_ft.sh`.
diff --git a/inference_demo.py b/tts_demo.py
similarity index 100%
rename from inference_demo.py
rename to tts_demo.py

From 1a896d21fe2866cc99707140c2b533d0eef2c7ce Mon Sep 17 00:00:00 2001
From: Pranay Gosar <pgosarsteam@gmail.com>
Date: Fri, 3 May 2024 22:16:06 -0500
Subject: [PATCH 7/7] adjust cut off sec and target transcript

---
 tts_demo.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/tts_demo.py b/tts_demo.py
index 86cd506c..6f8c3478 100644
--- a/tts_demo.py
+++ b/tts_demo.py
@@ -62,10 +62,13 @@ def parse_arguments():
                         default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
                         help="original transcript")
     parser.add_argument("-tt", "--target_transcript", type=str,
-                        default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
+                        default="object was seen as a mirage in the lake in the distance,",
                         help="target transcript")
     parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6,
                         help="cut off point in seconds for input prompt")
+    parser.add_argument("-ma", "--margin", type=float, default=0.07,
+                    help="lowest margin in seconds between words for input prompt")
+
     args = parser.parse_args()
     return args
 
@@ -135,11 +138,33 @@ def parse_arguments():
 # if the above fails, it could be because the audio is too hard for the alignment model,
 # increasing the beam size usually solves the issue
 
+def find_closest_word_boundary(alignments, cut_off_sec, margin):
+    with open(alignments, 'r') as file:
+        # skip header
+        next(file)
+        prev_end = 0.0
+        cutoff_time = None
+        cutoff_index = None
+        for i, line in enumerate(file):
+            end = float(line.strip().split(',')[1])
+            if end >= cut_off_sec and end - prev_end >= margin:
+                cutoff_time = end + margin / 2
+                cutoff_index = i
+                break
+
+            prev_end = end
+        
+        return cutoff_time, cutoff_index
+
 # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-cut_off_sec = args.cut_off_sec  # NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio
-target_transcript = args.target_transcript
-# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
+# NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This should be different for different audio.
+cut_off_sec = args.cut_off_sec
+margin = args.margin
 audio_fn = f"{temp_folder}/{filename}.wav"
+alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
+cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin)
+target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript
+# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
 info = torchaudio.info(audio_fn)
 audio_dur = info.num_frames / info.sample_rate