127 changes: 101 additions & 26 deletions .gitignore
@@ -1,39 +1,114 @@
__pycache__/
*.py[cod]
*$py.class
*.cover
*.egg
*.egg-info
.pytest_cache
.ipynb_checkpoints

thumbs.db
.DS_Store
.idea
*.egg-info/
*.flac
*.gz
*.log
*rtx*
*.pdf
*.manifest
*.mkv
*.mo
*.mp3
*.mp4
*a40*
*durip*
*.pdf
*.png
sim_lr.ipynb
*.mp3
*.gz
*.flac
*.th
*.pth
*.pot
*.pt
local_*
*.pth
*.py,cover
*.py[cod]
*.sage.py
*.so
*.spec
*.th
*a40*
*durip*
*rtx*
.cache
.coverage
.coverage.*
.dmypy.json
.DS_Store
.eggs/
.env
.hypothesis/
.idea
.installed.cfg
.ipynb_checkpoints
.mypy_cache/
.nox/
.pdm-build/
.pdm-python
.pdm.toml
.pybuilder/
.pypirc
.pyre/
.pytest_cache
.pytest_cache/
.Python
.pytype/
.ropeproject
.ruff_cache/
.scrapy
.spyderproject
.spyproject
.tox/
.venv
.webassets-cache
/site
amt/
bad_files/
build/
celerybeat-schedule
celerybeat.pid
cover/
coverage.xml
cython_debug/
db.sqlite3
db.sqlite3-journal
demo/generated_tts
develop-eggs/
dist/
dmypy.json
docs/_build/
downloads/
eggs/
env.bak/
env/
ENV/
file_log.txt
file_log_debug*.txt
htmlcov/
hub/
instance/
ipython_config.py
lib/
lib64/
local_*
local_settings.py
MANIFEST
nosetests.xml
parts/
per_sample_res/
src/
res/seed_tts_eval/
pip-delete-this-directory.txt
pip-log.txt
profile_default/
res/lspc_eval
file_log.txt
file_log_debug*.txt
bad_files/
amt/
res/seed_tts_eval/
sam1.wav
sam2.wav
sam3.wav
demo/generated_tts
sdist/
share/python-wheels/
sim_lr.ipynb
src/
target/
thumbs.db
var/
venv.bak/
venv/
wheels/
__pycache__/
__pypackages__/
generated_tts/
File renamed without changes.
114 changes: 53 additions & 61 deletions README.md
@@ -1,79 +1,71 @@
# VoiceStar: Robust, Duration-controllable TTS that can Extrapolate
# VoiceStar: Robust, Duration-Controllable TTS that can Extrapolate

## TODO
- [x] Gradio demo ETA: 6 April 2025
- [ ] Research Paper: 7 April 2025 - 14 April 2025
VoiceStar is a robust, duration-controllable TTS model with support for test-time extrapolation, meaning it can generate speech longer than the durations it was trained on.

## Features

- **Duration control**: Specify the duration of the generated speech.
- **Zero-shot voice cloning**: Clone any voice with a short reference audio clip ([demo video](https://x.com/PuyuanPeng/status/1908822618167300419)).

Coming soon: research paper (ETA: 7 April 2025 - 14 April 2025)

## Quick Start

### Install

## 1. Env setup
### Download model
```bash
# under VoiceStar root dir
wget -O ./pretrained/encodec_6f79c6a8.th https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th?download=true
wget -O ./pretrained/VoiceStar_840M_30s.pth https://huggingface.co/pyp1/VoiceStar/resolve/main/VoiceStar_840M_30s.pth?download=true
wget -O ./pretrained/VoiceStar_840M_40s.pth https://huggingface.co/pyp1/VoiceStar/resolve/main/VoiceStar_840M_40s.pth?download=true
```
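Alternatively, the same checkpoints can be fetched with `huggingface_hub` (a sketch; the repo and file names are taken from the wget URLs above, and the package is installed separately via `pip install huggingface_hub`):

```python
from huggingface_hub import hf_hub_download

# Download the checkpoints referenced above into ./pretrained.
# hf_hub_download keeps the original filenames, so the encodec file
# lands as encodec_4cb2048_giga.th rather than encodec_6f79c6a8.th.
hf_hub_download(repo_id="pyp1/VoiceCraft", filename="encodec_4cb2048_giga.th", local_dir="./pretrained")
hf_hub_download(repo_id="pyp1/VoiceStar", filename="VoiceStar_840M_30s.pth", local_dir="./pretrained")
hf_hub_download(repo_id="pyp1/VoiceStar", filename="VoiceStar_840M_40s.pth", local_dir="./pretrained")
```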
### Inference only:
```bash
conda create -n voicestar python=3.10
conda activate voicestar
pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
pip install numpy tqdm fire
pip install phonemizer==3.2.1
apt-get install espeak-ng # backend for the phonemizer
pip install torchmetrics
pip install einops
pip install omegaconf==2.3.0
pip install openai-whisper
pip install gradio
pip install voicestar
```
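To sanity-check the environment afterwards, a quick import test (a minimal sketch covering the packages installed above):

```python
# Verify the core inference dependencies import and report the compute backend.
import torch
import torchaudio
import phonemizer  # needs the espeak-ng system package installed above
import whisper  # module name of the openai-whisper package

print("torch", torch.__version__, "| torchaudio", torchaudio.__version__)
print("CUDA available:", torch.cuda.is_available())
```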

* To avoid warnings like:
[WARNING] words_mismatch.py:88 || words count mismatch on 200.0% of the lines (2/1)
```python
# go to ~/miniconda3/envs/voicestar/lib/python3.10/site-packages/phonemizer/backend/espeak/words_mismatch.py
# and bypass the warning like this
def _resume(self, nmismatch: int, nlines: int):
    """Logs a high level undetailed warning"""
    pass
    # if nmismatch:
    #     self._logger.warning(
    #         'words count mismatch on %s%% of the lines (%s/%s)',
    #         round(nmismatch / nlines, 2) * 100, nmismatch, nlines)
```
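A less invasive alternative is to silence the logger at runtime instead of editing installed files; a sketch, assuming the warning is emitted through phonemizer's standard `logging` logger:

```python
import logging

# Suppress phonemizer's words-count-mismatch warnings without patching site-packages.
logging.getLogger("phonemizer").setLevel(logging.ERROR)
```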
Make sure you also have `espeak-ng` installed.

**Note:** If you run into issues installing VoiceStar with `uv`, try installing it with `pip` instead.

### Usage

Basic usage:

### Training and data processing
*additional packages*:
```bash
pip install huggingface_hub
pip install datasets
pip install tensorboard
pip install wandb
pip install matplotlib
pip install ffmpeg-python
pip install scipy
pip install soundfile
voicestar --reference-speech "./demo/5895_34622_000026_000002.wav" --target-text "I cannot believe that the same model can also do text to speech synthesis too! And you know what? this audio is 8 seconds long." --target-duration 8
```

## 2. Example
### Command line example
Check the signature of the `run_inference` function in `inference_commandline.py` for adjustable hyperparameters.
Please refer to the CLI and Python API documentation below for more advanced usage.

_Note: CUDA, CPU, and MPS (Apple Silicon) are all supported._
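
For reference, backend selection in PyTorch typically looks like the sketch below; VoiceStar's own device handling may differ:

```python
import torch

# Prefer CUDA, then Apple Silicon's MPS, then fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)
```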

## Training

Please refer to the [training docs](docs/training.md) for more information.

## Inference

### CLI

```bash
# under root dir
conda activate voicestar
python inference_commandline.py \
--reference_speech "./demo/5895_34622_000026_000002.wav" \
--target_text "I cannot believe that the same model can also do text to speech synthesis too! And you know what? this audio is 8 seconds long." \
--target_duration 8
voicestar --reference-speech "./demo/5895_34622_000026_000002.wav" --target-text "I cannot believe that the same model can also do text to speech synthesis too!"
```

### Gradio
View all available options:

```bash
conda activate voicestar
python inference_gradio.py
voicestar --help
```

### Python API

```python
from voicestar import VoiceStar

# Initialize the model
model = VoiceStar()

# Generate speech from text
audio = model.generate("I cannot believe that the same model can also do text to speech synthesis too!")
audio.save("output.wav")
```
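
Voice cloning and duration control go through the same `generate` call; the minimal sketch below mirrors the `reference_speech` and `target_duration` keywords used in `app.py` further down (the duration is a target, not a hard guarantee):

```python
from voicestar import VoiceStar

model = VoiceStar()

# Clone the voice from a short reference clip and aim for ~8 seconds of output.
audio = model.generate(
    reference_speech="./demo/5895_34622_000026_000002.wav",
    text="And you know what? This audio is eight seconds long.",
    target_duration=8,
)
audio.save("cloned_output.wav")
```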

## License
Code license: MIT

Model Weights License: CC-BY-4.0 (as the Emilia dataset we used is released under this license)
The code in this repo is licensed under the MIT license. The pretrained model weights available on Hugging Face are licensed under the CC-BY-4.0 license.

This repository may contain third-party software which may be licensed under different licenses.
96 changes: 96 additions & 0 deletions app.py
@@ -0,0 +1,96 @@
import gradio as gr
import torch
import os
from voicestar import VoiceStar
from voicestar.utils import seed_everything
from txtsplit import txtsplit
import numpy as np

ABOUT = """
# VoiceStar TTS

Gradio demo for [VoiceStar](https://github.com/jasonppy/VoiceStar): robust, duration-controllable TTS that can extrapolate.
"""

# Initialize model once outside the function for better performance
model = VoiceStar()


def generate_audio(
    reference_speech,
    text,
    duration=0.0,
    top_k=10,
    temperature=1.0,
    repeat_prompt=1,
    seed=1,
    progress=gr.Progress(),
):
    # Set seed for reproducibility if provided
    if seed > 0:
        seed_everything(seed)

    # Update model parameters if needed
    model.api.top_k = top_k
    model.api.temperature = temperature
    model.repeat_prompt = repeat_prompt

    # Generate speech
    target_duration = None if duration <= 0 else duration
    texts = txtsplit(text)
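    # txtsplit breaks long inputs into sentence-sized chunks; each chunk is
    # synthesized separately below and the resulting waveforms are concatenated.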

    audios = []
    for t in progress.tqdm(texts, desc=f"Generating audio in {len(texts)} chunks"):
        audio = model.generate(
            reference_speech=reference_speech, text=t, target_duration=target_duration
        )
        audios.append(audio.waveform.squeeze().numpy())

    audio = np.concatenate(audios)

    # Return audio for gradio
    return (16000, audio)


with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    inp_ref = gr.Audio(label="Reference Audio", type="filepath")
    inp_text = gr.Textbox(label="Text to synthesize")

    with gr.Accordion("Advanced Settings", open=False):
        inp_reference_text = gr.Textbox(
            label="Reference Text",
            info="Enter a transcription of the reference audio. This is optional - if not provided, the model will transcribe the audio automatically.",
        )
        inp_duration = gr.Number(
            label="Duration",
            info="Set to 0 to automatically estimate duration",
            value=0.0,
        )
        inp_top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=10)
        inp_temp = gr.Slider(
            label="Temperature", minimum=0.0, maximum=2.0, step=0.01, value=1.0
        )
        inp_repeat_prompt = gr.Slider(
            label="Repeat prompt", minimum=1, maximum=10, step=1, value=1
        )
        inp_seed = gr.Number(label="Seed", info="Set to 0 to use random seed", value=1)

    btn_generate = gr.Button("Generate", variant="primary")
    out_audio = gr.Audio(label="Generated Audio")

    btn_generate.click(
        fn=generate_audio,
        inputs=[
            inp_ref,
            inp_text,
            inp_duration,
            inp_top_k,
            inp_temp,
            inp_repeat_prompt,
            inp_seed,
        ],
        outputs=[out_audio],
    )

demo.queue().launch()