Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions everyvoice/base_cli/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ def summarize_hfgl_generator_model(model_path: Path, checkpoint: dict) -> None:
print(summary(vocoder_model, None, verbose=0))


def summarize_styletts2_model(model_path: Path, checkpoint: dict) -> None:
    """Print a torchinfo summary of the StyleTTS2 model stored at *model_path*.

    The module is rebuilt from ``model_path`` with
    ``StyleTTS2Module.load_from_checkpoint``. ``checkpoint`` is not read
    here; the parameter mirrors the signature of the other ``summarize_*``
    functions in this module.
    """
    # Imports are kept function-local, as in the original block.
    from torchinfo import summary

    from everyvoice.model.e2e.StyleTTS2_lightning.styletts2.lightning import (
        StyleTTS2Module,
    )

    styletts2_module = StyleTTS2Module.load_from_checkpoint(model_path)
    architecture = summary(styletts2_module, None, verbose=0)
    print(architecture)


def summarize_unknown_model(model_path: Path, checkpoint: dict) -> None:
from tabulate import tabulate

Expand Down Expand Up @@ -194,6 +205,7 @@ def inspect(

if show_architecture:
checkpoint = load_checkpoint(model_path, minimal=False)

if "model_info" in checkpoint:
print(
"Inspecting checkpoint according to its model info:",
Expand All @@ -203,6 +215,7 @@ def inspect(
"FastSpeech2": summarize_fs2_model,
"HiFiGAN": summarize_hfgl_model,
"HiFiGANGenerator": summarize_hfgl_generator_model,
"StyleTTS2Module": summarize_styletts2_model,
}
summarizer = model_summarizers.get(checkpoint["model_info"]["name"], None)
if summarizer:
Expand Down
41 changes: 41 additions & 0 deletions everyvoice/base_cli/prediction_writing_callback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Generic base for synthesis-output-writing Lightning callbacks.

Shared by the FS2 and StyleTTS2 prediction-writing callback hierarchies.
Subclasses override ``on_predict_batch_end`` with format-specific logic.
"""

from __future__ import annotations

from pathlib import Path

from pytorch_lightning.callbacks import Callback


class BasePredictionWritingCallback(Callback):
    """Base callback that prepares the output directory and names output files.

    This class only creates ``save_dir`` and builds output paths; the
    format-specific writing is left to subclasses, which are expected to
    provide their own ``on_predict_batch_end``.
    """

    def __init__(
        self,
        save_dir: Path,
        file_extension: str,
        global_step: int,
        include_global_step_in_filename: bool = False,
    ) -> None:
        """Store the naming settings and create ``save_dir`` if it is missing.

        Args:
            save_dir: Directory under which outputs are written.
            file_extension: Final component of every generated filename.
            global_step: Step number, stored pre-formatted as ``ckpt=<step>``.
            include_global_step_in_filename: When true, the ``ckpt=<step>``
                tag is placed just before ``file_extension`` in filenames.
        """
        super().__init__()
        self.sep = "--"
        self.save_dir = save_dir
        self.file_extension = file_extension
        self.include_global_step_in_filename = include_global_step_in_filename
        # Kept as a ready-to-join string rather than the raw integer.
        self.global_step = f"ckpt={global_step}"
        self.save_dir.mkdir(parents=True, exist_ok=True)

    def get_filename(self, basename: str, speaker: str, language: str) -> str:
        """Return the output path for one utterance as a string.

        The name is ``basename``, ``speaker``, ``language``, the optional
        ``ckpt=<step>`` tag and ``file_extension`` joined with ``self.sep``.
        The parent directory of the resulting path is created if needed.
        """
        step_tag = [self.global_step] if self.include_global_step_in_filename else []
        pieces = [basename, speaker, language, *step_tag, self.file_extension]
        target = self.save_dir / self.sep.join(pieces)
        # The joined name may contain path separators, so the parent can
        # differ from save_dir; make sure it exists before returning.
        target.parent.mkdir(parents=True, exist_ok=True)
        return str(target)
215 changes: 209 additions & 6 deletions everyvoice/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
from everyvoice.model.aligner.wav2vec2aligner.aligner.cli import (
extract_segments_from_textgrid,
)
from everyvoice.model.e2e.StyleTTS2_lightning.styletts2.cli.synthesize import (
synthesize as synthesize_styletts2,
)
from everyvoice.model.e2e.StyleTTS2_lightning.styletts2.cli.train import (
train as train_styletts2,
)
Expand Down Expand Up @@ -571,9 +574,12 @@ def new_project(
help="""
# Synthesize Help

- **from-text** --- This is the most common input for performing normal speech synthesis. It will take text or a filelist with text and produce either waveform audio or spectrogram.
- **from-text** --- This is the most common input for performing normal speech synthesis. It will take text or a filelist with text and produce either waveform audio or spectrogram. This option uses FastSpeech2 & HiFiGAN. If you want to do end-to-end synthesis with StyleTTS2, run `everyvoice synthesize text-to-wav` instead.

- **text-to-wav** --- Synthesize audio directly from text using a trained end-to-end (StyleTTS2) model. Only supports the wav output format.

- **from-spec** --- This is the model that turns your spectral features into audio. This type of synthesis is also known as copy synthesis and unless you know what you are doing, you probably don't want to do this.

""",
)

Expand All @@ -585,6 +591,11 @@ def new_project(
name="from-spec",
)(synthesize_hfg)

# Register the StyleTTS2 synthesis entry point as the `text-to-wav`
# sub-command of the synthesize group.
synthesize_group.command(
    short_help="Synthesize audio from text using a trained StyleTTS2 model",
    name="text-to-wav",
)(synthesize_styletts2)

app.add_typer(
synthesize_group,
name="synthesize",
Expand Down Expand Up @@ -669,9 +680,29 @@ def test(suite: TestSuites = typer.Argument("dev")): # pragma: no cover
)


@app.command()
# Typer sub-application holding the demo commands (`text-to-spec` and
# `text-to-wav`); it is mounted on the main app under the name "demo".
demo_group = typer.Typer(
    cls=TyperGroupOrderAsDeclared,
    rich_markup_mode="markdown",
    context_settings={"help_option_names": ["-h", "--help"]},
    no_args_is_help=True,
    pretty_exceptions_show_locals=False,
    help="""
# Demo Help

- **text-to-spec** --- Launch an interactive Gradio demo for a two-stage (FastSpeech2 + HiFiGAN) model.

- **text-to-wav** --- Launch an interactive Gradio demo for an end-to-end (StyleTTS2) model.
""",
)


@demo_group.command(
name="text-to-spec",
short_help="Launch a Gradio demo for a text-to-spec (FastSpeech2 + HiFiGAN) model",
)
@merge_args(inference_base_command_interface)
def demo(
def demo_text_to_spec(
text_to_spec_model: Annotated[
Path,
typer_file_argument(
Expand Down Expand Up @@ -699,19 +730,19 @@ def demo(
["all"],
"--language",
"-l",
help="Specify languages to be included in the demo. Must be supported by your model. Example: everyvoice demo TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --language eng --language fin",
help="Specify languages to be included in the demo. Must be supported by your model. Example: everyvoice demo text-to-spec TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --language eng --language fin",
),
speakers: list[str] = typer.Option(
["all"],
"--speaker",
"-s",
help="Specify speakers to be included in the demo. Must be supported by your model. Example: everyvoice demo TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --speaker speaker_1 --speaker Sue",
help="Specify speakers to be included in the demo. Must be supported by your model. Example: everyvoice demo text-to-spec TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --speaker speaker_1 --speaker Sue",
),
outputs: list[AllowedDemoOutputFormats] = typer.Option(
["all"],
"--output-format",
"-O",
help="Specify output formats to be included in the demo. Example: everyvoice demo TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --output-format wav --output-format readalong-html",
help="Specify output formats to be included in the demo. Example: everyvoice demo text-to-spec TEXT_TO_SPEC_MODEL SPEC_TO_WAV_MODEL --output-format wav --output-format readalong-html",
),
output_dir: Path = typer_directory_option(
"synthesis_output",
Expand Down Expand Up @@ -764,6 +795,7 @@ def demo(
] = None,
**kwargs,
):
"""Launch an interactive Gradio demo for a two-stage (FastSpeech2 + HiFiGAN) model."""
if allowlist and denylist:
raise typer.BadParameter(
"You provided a value for both the allowlist and the denylist but you can only provide one."
Expand Down Expand Up @@ -835,6 +867,177 @@ def demo(
)


def _read_utterance_list(list_path: Optional[Path]) -> list[str]:
    """Return the stripped, non-blank lines of *list_path*.

    Returns an empty list when no path was given. The file is read as UTF-8
    so that non-ASCII utterances load the same way on every platform.
    """
    if list_path is None:
        return []
    with open(list_path, encoding="utf8") as f:
        return [line.strip() for line in f if line.strip()]


def _parse_styletts2_speakers(speaker_specs: list[str]) -> dict[str, Path]:
    """Parse ``'Display Name=path/to/audio.wav'`` entries into a name -> path dict.

    Raises:
        typer.BadParameter: if an entry has no ``=``, has an empty display
            name, repeats a display name, or does not point at an existing file.
    """
    speakers: dict[str, Path] = {}
    for spec in speaker_specs:
        # Split on the first "=" only, so the path itself may contain "=".
        raw_name, separator, raw_path = spec.partition("=")
        if not separator:
            raise typer.BadParameter(
                f"Speaker '{spec}' must be in the format 'Display Name=path/to/audio.wav'.",
                param_hint="--speaker",
            )
        display_name = raw_name.strip()
        if not display_name:
            raise typer.BadParameter(
                f"Speaker '{spec}' has an empty display name; "
                "use the format 'Display Name=path/to/audio.wav'.",
                param_hint="--speaker",
            )
        if display_name in speakers:
            raise typer.BadParameter(
                f"Speaker name '{display_name}' was provided more than once.",
                param_hint="--speaker",
            )
        audio_path = Path(raw_path.strip()).expanduser()
        # is_file() rather than exists(): an empty path resolves to "." and a
        # directory would otherwise pass validation.
        if not audio_path.is_file():
            raise typer.BadParameter(
                f"Speaker audio file not found or not a regular file: {audio_path}",
                param_hint="--speaker",
            )
        speakers[display_name] = audio_path
    return speakers


@demo_group.command(
    name="text-to-wav",
    short_help="Launch a Gradio demo for an end-to-end (StyleTTS2) model",
)
def demo_text_to_wav(
    model_path: Annotated[
        Path,
        typer_file_argument(help="The path to a trained StyleTTS2 checkpoint (.ckpt)."),
    ],
    reference: Optional[Path] = typer.Option(
        None,
        "--reference",
        "-r",
        help="Path to a reference audio file that sets the default speaker style in the UI. "
        "Use --speaker for a named multi-speaker dropdown.",
        exists=True,
    ),
    speaker: list[str] = typer.Option(
        [],
        "--speaker",
        "-s",
        help="Named speaker defined as 'Display Name=path/to/audio.wav'. "
        "Repeat the flag to add multiple speakers. "
        "Their style encodings are pre-computed at startup and shown in a dropdown. "
        "Example: everyvoice demo text-to-wav MODEL_PATH --speaker 'Alice=alice.wav' --speaker 'Bob=bob.wav'",
    ),
    allowlist: Annotated[
        Optional[Path],
        typer_file_option(
            "--allowlist",
            help="A plain text file containing a list of words or utterances to allow synthesizing. "
            "Words/utterances should be separated by a new line in a plain text file. "
            "All other words are disallowed.",
        ),
    ] = None,
    denylist: Annotated[
        Optional[Path],
        typer_file_option(
            "--denylist",
            help="A plain text file containing a list of words or utterances to disallow synthesizing. "
            "Words/utterances should be separated by a new line in a plain text file. "
            "All other words are allowed. "
            "IMPORTANT: there are many ways to 'hack' the denylist that we do not protect against. "
            "We suggest using the 'allowlist' instead for maximum security if you know the full list "
            "of utterances you want to allow synthesis for.",
        ),
    ] = None,
    output_dir: Path = typer_directory_option(
        "synthesis_output",
        "--output-dir",
        "-o",
        exists=False,
        help="The directory where your synthesized audio should be written.",
    ),
    accelerator: str = typer.Option(
        "auto",
        "--accelerator",
        "-a",
        help="Specify the Pytorch Lightning accelerator to use.",
    ),
    port: int = typer.Option(7860, "--port", "-p", help="The port to run the demo on."),
    share: bool = typer.Option(
        False,
        "--share",
        help="Share the demo using Gradio's share feature. This will make the demo accessible from the internet.",
    ),
    server_name: str = typer.Option(
        "0.0.0.0",
        "--server-name",
        "-n",
        help="The server name to run the demo on. This is useful if you want to run the demo on a specific IP address.",
    ),
):
    """Launch an interactive Gradio demo for an end-to-end (StyleTTS2) model."""
    # NOTE: the docstring above is what Typer shows as this command's help
    # text, so the implementation notes live in comments instead.
    #
    # Flow: validate the option combination, parse the --speaker entries,
    # load the allow/deny lists, print a summary of the run parameters
    # (including a best-effort dump of the checkpoint config), then build
    # and launch the Gradio app.
    if not speaker and reference is None:
        raise typer.BadParameter(
            "Provide at least one --speaker 'Name=path/to/audio.wav' or a --reference path.",
            param_hint="--speaker / --reference",
        )
    if allowlist and denylist:
        raise typer.BadParameter(
            "You provided a value for both the allowlist and the denylist but you can only provide one."
        )

    # Parse --speaker "Display Name=path/to/audio.wav" entries
    speakers_dict = _parse_styletts2_speakers(speaker)

    # --reference with no --speaker → reference-upload mode (no speaker dropdown).
    # When any --speaker is given, --reference is not forwarded to the app.
    default_reference = reference if not speakers_dict else None

    allowlist_data = _read_utterance_list(allowlist)
    denylist_data = _read_utterance_list(denylist)

    import json

    import torch

    print("INFO - Starting the StyleTTS2 demo with the following parameters:")
    print(f" - Model Path: {model_path}")
    try:
        # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects
        # from the checkpoint; only run this command on checkpoints you trust.
        _state = torch.load(model_path, map_location="cpu", weights_only=False)
        _hp = _state.get("hyper_parameters", {})
        print(f" - Mode: {_hp.get('mode', 'unknown')} (from checkpoint)")
        if "config" in _hp:
            print(" - Checkpoint config:")
            print(json.dumps(_hp["config"], indent=4, default=str))
        del _state
    except Exception as e:
        # Deliberately best-effort: this dump is informational only, so a
        # failure to read the config must not prevent the demo from starting.
        print(f" - (Could not read checkpoint config: {e})")
    if speakers_dict:
        for name, path in speakers_dict.items():
            print(f" - Speaker: {name} = {path}")
    else:
        print(f" - Reference: {reference}")
    print(f" - Allowlist: {allowlist if allowlist else 'None'}")
    print(f" - Denylist: {denylist if denylist else 'None'}")
    print(f" - Output Dir: {output_dir}")
    print(f" - Accelerator: {accelerator}")
    print(f" - Port: {port}")
    print(f" - Share: {share}")
    print(f" - Server Name: {server_name}")

    with spinner("Loading software"):
        from everyvoice.demo.app import create_demo_app_styletts2

    with spinner("Loading model"):
        demo = create_demo_app_styletts2(
            model_path=model_path,
            output_dir=output_dir,
            speakers=speakers_dict,
            default_reference=default_reference,
            accelerator=accelerator,
            allowlist=allowlist_data,
            denylist=denylist_data,
        )

    demo.launch(
        share=share,
        server_port=port,
        server_name=server_name,
        allowed_paths=[str(output_dir), tempfile.gettempdir()],
    )


# Mount the demo sub-application on the main CLI app under the name "demo".
app.add_typer(
    demo_group,
    short_help="Launch an interactive Gradio demo for your EveryVoice models",
    name="demo",
)


@app.command(hidden=True)
def update_schemas(
out_dir: Annotated[
Expand Down
Loading
Loading