From a223501f68ae0e7cc61b7d2d3c36a1598bf80acf Mon Sep 17 00:00:00 2001 From: leweex95 Date: Wed, 25 Feb 2026 15:21:52 +0100 Subject: [PATCH 01/13] various stability fixes on cpu for chatterbox and qwen3 --- docs/cpu_benchmark_report.md | 41 +++++++++ pyproject.toml | 2 +- src/voicegenhub/__init__.py | 10 ++- src/voicegenhub/providers/bark.py | 19 ++--- src/voicegenhub/providers/chatterbox.py | 7 ++ src/voicegenhub/providers/factory.py | 24 +++--- src/voicegenhub/providers/qwen.py | 7 ++ src/voicegenhub/utils/compatibility.py | 74 ++++++++++++++++ tests/conftest.py | 34 ++++++-- tests/integration/test_provider_load.py | 107 ++++++++++++++++++++++++ 10 files changed, 292 insertions(+), 33 deletions(-) create mode 100644 docs/cpu_benchmark_report.md create mode 100644 src/voicegenhub/utils/compatibility.py create mode 100644 tests/integration/test_provider_load.py diff --git a/docs/cpu_benchmark_report.md b/docs/cpu_benchmark_report.md new file mode 100644 index 0000000..c327ed9 --- /dev/null +++ b/docs/cpu_benchmark_report.md @@ -0,0 +1,41 @@ +# CPU Generation Comparison Report: Qwen 3 vs. Chatterbox + +This report compares the locally hosted performance of **Qwen 3 TTS** and **Chatterbox TTS** when running exclusively on **CPU**. + +## Benchmark Methodology +- **Hardware:** Windows CPU (Local) +- **Environment:** Python 3.13, PyTorch 2.10.0 (Manual version, no flash-attn) +- **Input Phrases:** + 1. "Warm up." (Short) + 2. "The quick brown fox jumps over the lazy dog." (44 chars) +- **Metrics:** Initialization time, Generation time, Real-Time Factor (RTF). 
+ +## Results Summary + +| Metric | Qwen 3 TTS (CPU) | Chatterbox TTS (CPU) | +|--------|------------------|----------------------| +| **Model Load/Init** | 25.04s | 37.42s | +| **Warm-up (2 words)** | ~464.0s (7.7 min) | ~80.0s (1.3 min) | +| **Text 1 (44 chars)** | > 720s (Timed out/Interrupted) | 98.71s | +| **Audio Duration** | ~1.5s (est) | 2.44s | +| **RTF (Estimated)** | **~300+** | **40.44** | + +## Key Findings + +### 1. Performance Gap +**Chatterbox is approximately 7-8 times faster than Qwen 3 on CPU.** While both are significantly slower than real-time (RTF > 1.0), Chatterbox is at least usable for very short snippets if you are willing to wait, whereas Qwen 3 is infeasibly slow for interactive use without GPU acceleration. + +### 2. Qwen 3 Constraints +Qwen 3's performance suffers heavily on CPU due to: +- **Missing `flash-attn`:** The lack of flash attention forces a manual PyTorch attention implementation, which is not optimized for CPU instruction sets. +- **Large Transformer Architecture:** Even the 0.6B version of Qwen 3 (CustomVoice) is heavy for single-threaded or unoptimized CPU inference. +- **RTF > 300:** Generating 1 second of audio takes over 5 minutes. + +### 3. Chatterbox Advantages +Chatterbox performs better on CPU because: +- **Optimization:** The provider includes specific patches for CPU compatibility (float32 normalization, S3Tokenizer patches, etc.). +- **Architecture:** While still a transformer-based model (T3), it seems better suited for local CPU execution than the current Qwen 3 implementation in this environment. +- **RTF ~40:** Generating 1 second of audio takes about 40 seconds. + +## Conclusion +If running locally on CPU is a requirement, **Chatterbox** is the only viable option between the two, though it remains quite slow. **Qwen 3** is essentially unusable on CPU without significant optimization or quantization (which was not evaluated here). 
For production CPU-only use, lighter providers (like Edge TTS or Kokoro) are recommended over these two heavier models. diff --git a/pyproject.toml b/pyproject.toml index 7944470..8f3f5b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "voicegenhub" -version = "1.1.4" +version = "1.1.5" description = "Simple Text-to-Speech library supporting multiple providers" authors = ["leweex95 "] readme = "README.md" diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index d3ffc6c..534e4fc 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -16,7 +16,7 @@ audio = await tts.generate("Hello, world!", voice="en-US-AriaNeural") """ -__version__ = "0.1.0" +__version__ = "1.1.4" __author__ = "leweex95" __email__ = "csibi.levente14@gmail.com" @@ -24,6 +24,14 @@ # Set attention implementation to eager before any imports to prevent SDPA warnings os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' +# Apply CPU compatibility patches early to prevent import-time crashes in dependencies +try: + from .utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() +except Exception: + # Fail silently to avoid breaking the whole library if the tool itself has issues + pass + # Import core classes when available try: from .config.settings import Settings # noqa: F401 diff --git a/src/voicegenhub/providers/bark.py b/src/voicegenhub/providers/bark.py index 0dc47f2..a04461f 100644 --- a/src/voicegenhub/providers/bark.py +++ b/src/voicegenhub/providers/bark.py @@ -61,18 +61,15 @@ def display_name(self) -> str: return "Bark (Suno)" async def initialize(self) -> None: - """Initialize the Bark TTS provider. - - Bark loads three separate neural network models for speech synthesis: - 1. Text encoder (converts text to embeddings): ~312M params - 2. Coarse acoustic model (predicts coarse acoustic tokens): ~314M params - 3. 
Fine acoustic model (predicts fine acoustic tokens for quality): ~302M params - - This multi-model architecture allows Bark to synthesize natural, expressive speech. - The models are loaded once on initialization and reused for all subsequent inference calls. - Loading may take time on first run, but subsequent calls are much faster. - """ + """Initialize the Bark TTS provider.""" try: + # Apply CPU compatibility patches + try: + from ..utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() + except ImportError: + pass + import torch import torch.serialization import warnings diff --git a/src/voicegenhub/providers/chatterbox.py b/src/voicegenhub/providers/chatterbox.py index d604a04..29c2678 100644 --- a/src/voicegenhub/providers/chatterbox.py +++ b/src/voicegenhub/providers/chatterbox.py @@ -255,6 +255,13 @@ async def initialize(self): try: logger.info("Initializing Chatterbox TTS provider...") + # Apply CPU compatibility patches specifically if not already done + try: + from ..utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() + except ImportError: + pass + # Safety check for Perth watermarker (often fails if setuptools/pkg_resources is missing) try: import perth diff --git a/src/voicegenhub/providers/factory.py b/src/voicegenhub/providers/factory.py index 8390a9a..e1c433e 100644 --- a/src/voicegenhub/providers/factory.py +++ b/src/voicegenhub/providers/factory.py @@ -25,38 +25,38 @@ async def discover_provider(self, provider_id: str) -> None: try: from .edge import EdgeTTSProvider self._edge_provider_class = EdgeTTSProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"Edge provider discovery failed: {e}") elif provider_id == "kokoro": try: from .kokoro import KokoroTTSProvider self._kokoro_provider_class = KokoroTTSProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"Kokoro provider discovery failed: {e}") elif provider_id 
== "elevenlabs": try: from .elevenlabs import ElevenLabsTTSProvider self._elevenlabs_provider_class = ElevenLabsTTSProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"ElevenLabs provider discovery failed: {e}") elif provider_id == "bark": try: from .bark import BarkProvider self._bark_provider_class = BarkProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"Bark provider discovery failed: {e}") elif provider_id == "chatterbox": try: from .chatterbox import ChatterboxProvider self._chatterbox_provider_class = ChatterboxProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"Chatterbox provider discovery failed: {e}") elif provider_id == "qwen": try: from .qwen import QwenTTSProvider self._qwen_provider_class = QwenTTSProvider - except ImportError: - pass + except ImportError as e: + logger.debug(f"Qwen provider discovery failed: {e}") async def create_provider( self, provider_id: str, config: Optional[Dict[str, Any]] = None diff --git a/src/voicegenhub/providers/qwen.py b/src/voicegenhub/providers/qwen.py index e708a59..4a36e9b 100644 --- a/src/voicegenhub/providers/qwen.py +++ b/src/voicegenhub/providers/qwen.py @@ -117,6 +117,13 @@ async def initialize(self) -> None: return try: + # Apply CPU compatibility patches + try: + from ..utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() + except ImportError: + pass + from qwen_tts import Qwen3TTSModel logger.info( diff --git a/src/voicegenhub/utils/compatibility.py b/src/voicegenhub/utils/compatibility.py new file mode 100644 index 0000000..0d2050c --- /dev/null +++ b/src/voicegenhub/utils/compatibility.py @@ -0,0 +1,74 @@ +"""Compatibility utilities for problematic dependencies.""" + +import sys +import importlib.metadata +import os +from .logger import get_logger + +logger = get_logger(__name__) + +def apply_cpu_compatibility_patches(): + """Apply patches to ensure stability on CPU-only 
environments.""" + + # 1. Mock torchcodec if missing. This is a common failure point for Transformers >= 4.51 on CPU. + try: + importlib.metadata.version("torchcodec") + except importlib.metadata.PackageNotFoundError: + logger.info("torchcodec not found, applying compatibility mocks for Transformers/AudioUtils") + + # Mocking sys.modules + class MockTorchCodec: + __version__ = "0.9.1" + class Frame: pass + class Decoder: + def __init__(self, *args, **kwargs): pass + + sys.modules["torchcodec"] = MockTorchCodec() + + # Mocking importlib.metadata.version + # Save a reference to the original version function if not already patched + if not hasattr(importlib.metadata, "_original_version"): + importlib.metadata._original_version = importlib.metadata.version + + def patched_version(package_name): + if package_name == "torchcodec": + return "0.9.1" + return importlib.metadata._original_version(package_name) + + importlib.metadata.version = patched_version + # Some versions of Python/importlib might need patching in different places + # but this is the most common one. + + # 2. Performance/Stability environment variables + # Only set if not already present + if "TRANSFORMERS_ATTENTION_IMPLEMENTATION" not in os.environ: + os.environ["TRANSFORMERS_ATTENTION_IMPLEMENTATION"] = "eager" + + # 3. 
Patching torch specifically for CPU stability + # We do this lazily to avoid triggering heavy torch imports if they haven't happened yet + if "torch" in sys.modules: + _patch_torch_cuda(sys.modules["torch"]) + +def _patch_torch_cuda(torch): + """Specific patches for torch when it's already loaded.""" + if not torch.cuda.is_available(): + if "CUDA_VISIBLE_DEVICES" not in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = "" + + # Patch is_bf16_supported if it exists in torch.cuda + if hasattr(torch.cuda, "is_bf16_supported"): + # Some libs expect this to return False on CPU instead of crashing + pass + else: + try: + torch.cuda.is_bf16_supported = lambda: False + except Exception: + pass + +def ensure_torchcodec(): + """Specific check for torchcodec to satisfy Transformers >= 4.51.""" + try: + importlib.metadata.version("torchcodec") + except importlib.metadata.PackageNotFoundError: + # If we got here, apply patches if not already done + apply_cpu_compatibility_patches() diff --git a/tests/conftest.py b/tests/conftest.py index ab83f82..1eb53a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,18 +3,36 @@ @pytest.fixture(autouse=True) -def mock_provider_initialize(monkeypatch): - """Mock all provider initialize methods to avoid slow setup.""" +def mock_provider_initialize(request, monkeypatch): + """Mock all provider initialize methods to avoid slow setup, except for integration tests.""" + # Skip mocking for integration tests or tests that explicitly request real initialization + if "integration" in request.keywords or "no_mock" in request.keywords: + return + from voicegenhub.providers.edge import EdgeTTSProvider from voicegenhub.providers.kokoro import KokoroTTSProvider + # Add other providers as they are added + try: + from voicegenhub.providers.chatterbox import ChatterboxProvider + from voicegenhub.providers.bark import BarkProvider + from voicegenhub.providers.qwen import QwenTTSProvider + providers = [ + EdgeTTSProvider, + KokoroTTSProvider, + 
ChatterboxProvider, + BarkProvider, + QwenTTSProvider + ] + except ImportError: + providers = [ + EdgeTTSProvider, + KokoroTTSProvider, + ] async def mock_initialize(self): self._initialization_failed = False - - providers = [ - EdgeTTSProvider, - KokoroTTSProvider, - ] + self._initialized = True for provider_class in providers: - monkeypatch.setattr(provider_class, 'initialize', mock_initialize) + if hasattr(provider_class, 'initialize'): + monkeypatch.setattr(provider_class, 'initialize', mock_initialize) diff --git a/tests/integration/test_provider_load.py b/tests/integration/test_provider_load.py new file mode 100644 index 0000000..8441d83 --- /dev/null +++ b/tests/integration/test_provider_load.py @@ -0,0 +1,107 @@ +import pytest +import os +import asyncio +from voicegenhub.providers.factory import provider_factory +from voicegenhub.providers.base import TTSError, TTSRequest, AudioFormat + +@pytest.mark.asyncio +@pytest.mark.integration +class TestProviderLoading: + """ + Tests focused on verifying that providers can be discovered and initialized + in the current environment. This helps catch missing dependencies or + import-time crashes (like the torchcodec issue). 
+ """ + + async def _test_provider_init(self, provider_id): + """Helper to test discovery and initialization of a provider.""" + print(f"\nTesting discovery of '{provider_id}'...") + await provider_factory.discover_provider(provider_id) + + # Check if it was discovered + class_attr = f"_{provider_id}_provider_class" + provider_class = getattr(provider_factory, class_attr, None) + + if provider_class is None: + pytest.skip(f"Provider '{provider_id}' dependencies not installed in this environment.") + return + + print(f"Initializing '{provider_id}'...") + try: + provider = await provider_factory.create_provider(provider_id) + # create_provider calls initialize() + assert provider is not None + print(f"Successfully initialized '{provider_id}'.") + return provider + except (ImportError, TTSError) as e: + pytest.fail(f"Failed to initialize discovered provider '{provider_id}': {e}") + + async def test_chatterbox_load(self): + """Test that Chatterbox can be loaded and initialized.""" + await self._test_provider_init("chatterbox") + + async def test_qwen_load(self): + """Test that Qwen can be loaded and initialized.""" + await self._test_provider_init("qwen") + + async def test_kokoro_load(self): + """Test that Kokoro can be loaded and initialized.""" + await self._test_provider_init("kokoro") + + async def test_edge_load(self): + """Test that Edge TTS can be loaded and initialized.""" + await self._test_provider_init("edge") + + def test_torchcodec_compatibility_mock(self): + """Verify that our torchcodec compatibility mock works if torchcodec is missing.""" + import sys + import importlib.metadata + from voicegenhub.utils.compatibility import apply_cpu_compatibility_patches + + # We can't easily uninstall it here, but we can verify the patched state + # if it was already applied by __init__.py + try: + version = importlib.metadata.version("torchcodec") + assert version is not None + except importlib.metadata.PackageNotFoundError: + # If it's missing, apply the patch now 
+ apply_cpu_compatibility_patches() + version = importlib.metadata.version("torchcodec") + assert version == "0.9.1" + assert "torchcodec" in sys.modules + +@pytest.mark.asyncio +@pytest.mark.integration +class TestProviderExecution: + """ + Smoke tests to ensure that initialized providers can actually generate audio + on the current system (especially CPU). + """ + + @pytest.mark.slow + async def test_chatterbox_generate_smoke(self): + """Small generation test for Chatterbox on CPU.""" + await provider_factory.discover_provider("chatterbox") + if not provider_factory._chatterbox_provider_class: + pytest.skip("Chatterbox not installed") + + provider = await provider_factory.create_provider("chatterbox", config={"device": "cpu"}) + request = TTSRequest(text="Test.", voice_id="chatterbox-default") + + response = await provider.synthesize(request) + assert response.audio_data is not None + assert len(response.audio_data) > 0 + + @pytest.mark.slow + async def test_qwen_generate_smoke(self): + """Small generation test for Qwen on CPU.""" + await provider_factory.discover_provider("qwen") + if not provider_factory._qwen_provider_class: + pytest.skip("Qwen not installed") + + provider = await provider_factory.create_provider("qwen", config={"device": "cpu"}) + request = TTSRequest(text="Test.", voice_id="default") + + response = await provider.synthesize(request) + assert response.audio_data is not None + assert len(response.audio_data) > 0 From 075afd98bcb93296248ce17db65c1a2c1afca76c Mon Sep 17 00:00:00 2001 From: leweex95 Date: Wed, 25 Feb 2026 15:30:45 +0100 Subject: [PATCH 02/13] fix poetry install error --- README.md | 5 ++--- pyproject.toml | 3 +-- src/voicegenhub/__init__.py | 14 ++++++++++++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a262fe8..e779029 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,12 @@ poetry install -E voice-cloning ``` **Voice cloning requirements:** -- TorchCodec (automatically installed 
with `voice-cloning` extra) - FFmpeg (manual installation required) -- PyTorch ≤ 2.4.x (for TorchCodec compatibility) +- PyTorch (standard version) **On Windows:** Download the "full-shared" FFmpeg build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and add the `bin` directory to your system PATH. -**Note:** If TorchCodec is incompatible with your PyTorch version or FFmpeg is not available, VoiceGenHub will automatically fall back to standard TTS without voice cloning. +**Note:** VoiceGenHub includes a compatibility layer to ensure stable execution on CPU-only systems and prevents common import-time crashes related to experimental dependencies like TorchCodec. Standard TTS and voice cloning mechanisms will automatically fall back to supported audio loaders if needed. ## Usage diff --git a/pyproject.toml b/pyproject.toml index 8f3f5b4..8d4498b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,10 +22,9 @@ chatterbox-tts = {git = "https://github.com/rsxdalv/chatterbox.git", rev = "004a transformers = ">=4.46.3" torchaudio = ">=2.6.0" librosa = ">=0.11.0" -torchcodec = {version = "^0.9.1", optional = true} [tool.poetry.extras] -voice-cloning = ["torchcodec"] +voice-cloning = [] [tool.poetry.group.elevenlabs.dependencies] elevenlabs = "*" diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index 534e4fc..d16f60c 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -16,12 +16,22 @@ audio = await tts.generate("Hello, world!", voice="en-US-AriaNeural") """ -__version__ = "1.1.4" +__version__ = "1.1.5" __author__ = "leweex95" __email__ = "csibi.levente14@gmail.com" import os -# Set attention implementation to eager before any imports to prevent SDPA warnings + +# Apply compatibility patches as early as possible (before providers or heavy imports) +try: + from .utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() +except (ImportError, Exception): + # Fallback to 
prevent cyclic or import-time issues + os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' + os.environ['CUDA_VISIBLE_DEVICES'] = '' + +# Additional safety for transformers os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' # Apply CPU compatibility patches early to prevent import-time crashes in dependencies From 17ef859cbcb026ad467d612dc00b36d2b197e7ef Mon Sep 17 00:00:00 2001 From: leweex95 Date: Wed, 25 Feb 2026 15:40:55 +0100 Subject: [PATCH 03/13] fix attempt --- pyproject.toml | 1 + src/voicegenhub/__init__.py | 14 +++--------- src/voicegenhub/utils/compatibility.py | 31 ++++++++++++++++++-------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d4498b..12e019d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ structlog = "^23.2.0" rich = "^13.7.0" pydantic = "^2.11.9" setuptools = "^75.8.0" +typer = ">=0.9.0" spacy = "^3.8" edge-tts = "^7.2.0" kokoro = {git = "https://github.com/hexgrad/kokoro.git", rev = "dfb907a02bba8152ca444717ca5d78747ccb4bec"} diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index d16f60c..09c55bb 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -22,26 +22,18 @@ import os -# Apply compatibility patches as early as possible (before providers or heavy imports) +# Apply CPU compatibility patches early to prevent import-time crashes in dependencies try: from .utils.compatibility import apply_cpu_compatibility_patches apply_cpu_compatibility_patches() -except (ImportError, Exception): - # Fallback to prevent cyclic or import-time issues +except Exception: + # Fail silently to avoid breaking the whole library if the tool itself has issues os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' os.environ['CUDA_VISIBLE_DEVICES'] = '' # Additional safety for transformers os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' -# Apply CPU compatibility patches early to prevent import-time 
crashes in dependencies -try: - from .utils.compatibility import apply_cpu_compatibility_patches - apply_cpu_compatibility_patches() -except Exception: - # Fail silently to avoid breaking the whole library if the tool itself has issues - pass - # Import core classes when available try: from .config.settings import Settings # noqa: F401 diff --git a/src/voicegenhub/utils/compatibility.py b/src/voicegenhub/utils/compatibility.py index 0d2050c..ae46d98 100644 --- a/src/voicegenhub/utils/compatibility.py +++ b/src/voicegenhub/utils/compatibility.py @@ -10,20 +10,33 @@ def apply_cpu_compatibility_patches(): """Apply patches to ensure stability on CPU-only environments.""" - # 1. Mock torchcodec if missing. This is a common failure point for Transformers >= 4.51 on CPU. + # 1. Mock torchcodec if missing or corrupt. This is a common failure point for Transformers >= 4.51 on CPU. + # Handling PackageNotFoundError for missing, or Exception (e.g. IndexError) for corrupted installs. + is_broken_or_missing = False try: importlib.metadata.version("torchcodec") - except importlib.metadata.PackageNotFoundError: - logger.info("torchcodec not found, applying compatibility mocks for Transformers/AudioUtils") + except (importlib.metadata.PackageNotFoundError, Exception): + is_broken_or_missing = True + + if is_broken_or_missing or "torchcodec" not in sys.modules: + logger.info("torchcodec not found or corrupt, applying compatibility mocks for Transformers/AudioUtils") # Mocking sys.modules - class MockTorchCodec: - __version__ = "0.9.1" - class Frame: pass - class Decoder: - def __init__(self, *args, **kwargs): pass + import types + from importlib.machinery import ModuleSpec + + mock_codec = types.ModuleType("torchcodec") + mock_codec.__version__ = "0.9.1" + mock_codec.__spec__ = ModuleSpec("torchcodec", None) + + class Frame: pass + class Decoder: + def __init__(self, *args, **kwargs): pass + + mock_codec.Frame = Frame + mock_codec.Decoder = Decoder - sys.modules["torchcodec"] = 
MockTorchCodec() + sys.modules["torchcodec"] = mock_codec # Mocking importlib.metadata.version # Save a reference to the original version function if not already patched From 075153afda5a809528c95329036e7bb0702cda6b Mon Sep 17 00:00:00 2001 From: leweex95 Date: Wed, 25 Feb 2026 15:41:40 +0100 Subject: [PATCH 04/13] fix attempt --- src/voicegenhub/utils/compatibility.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/voicegenhub/utils/compatibility.py b/src/voicegenhub/utils/compatibility.py index ae46d98..11048eb 100644 --- a/src/voicegenhub/utils/compatibility.py +++ b/src/voicegenhub/utils/compatibility.py @@ -24,15 +24,15 @@ def apply_cpu_compatibility_patches(): # Mocking sys.modules import types from importlib.machinery import ModuleSpec - + mock_codec = types.ModuleType("torchcodec") mock_codec.__version__ = "0.9.1" mock_codec.__spec__ = ModuleSpec("torchcodec", None) - + class Frame: pass class Decoder: def __init__(self, *args, **kwargs): pass - + mock_codec.Frame = Frame mock_codec.Decoder = Decoder From 2016f24bf54630985102081ce3eb18ed49c7f9a8 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Wed, 25 Feb 2026 21:13:50 +0100 Subject: [PATCH 05/13] qwen3 tts on kaggle gpu implemented --- 20260225/voicegenhub-qwen3-tts-gpu.log | 91 +++++ README.md | 129 ++++++- docs/benchmarks_and_performance.md | 27 ++ docs/cloning_and_design.md | 51 +++ docs/installation.md | 66 ++++ docs/kaggle_gpu.md | 52 +++ docs/licensing.md | 19 + docs/providers.md | 51 +++ pyproject.toml | 2 +- src/voicegenhub/cli.py | 68 +++- src/voicegenhub/kaggle/__init__.py | 5 + src/voicegenhub/kaggle/pipeline.py | 482 +++++++++++++++++++++++++ 12 files changed, 1032 insertions(+), 11 deletions(-) create mode 100644 20260225/voicegenhub-qwen3-tts-gpu.log create mode 100644 docs/benchmarks_and_performance.md create mode 100644 docs/cloning_and_design.md create mode 100644 docs/installation.md create mode 100644 docs/kaggle_gpu.md create mode 100644 docs/licensing.md create 
mode 100644 docs/providers.md create mode 100644 src/voicegenhub/kaggle/__init__.py create mode 100644 src/voicegenhub/kaggle/pipeline.py diff --git a/20260225/voicegenhub-qwen3-tts-gpu.log b/20260225/voicegenhub-qwen3-tts-gpu.log new file mode 100644 index 0000000..3f1dc52 --- /dev/null +++ b/20260225/voicegenhub-qwen3-tts-gpu.log @@ -0,0 +1,91 @@ +[{"stream_name":"stderr","time":6.465816424,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"} +,{"stream_name":"stderr","time":6.46588892,"data":"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n"} +,{"stream_name":"stderr","time":6.465894936,"data":"0.00s - to python to disable frozen modules.\n"} +,{"stream_name":"stderr","time":6.465899538,"data":"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"} +,{"stream_name":"stderr","time":7.061415414,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"} +,{"stream_name":"stderr","time":7.061447949,"data":"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n"} +,{"stream_name":"stderr","time":7.06145249,"data":"0.00s - to python to disable frozen modules.\n"} +,{"stream_name":"stderr","time":7.061455112,"data":"0.00s - Note: Debugging will proceed. 
Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"} +,{"stream_name":"stdout","time":13.773725016,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.4/61.4 kB 5.3 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":13.927271089,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.0/44.0 kB 3.5 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":13.976515574,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.4/61.4 kB 5.3 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":13.97657512,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.0/44.0 kB 3.5 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":14.365197171,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.9/63.9 kB 4.8 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":14.567451397,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.9/63.9 kB 4.8 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.304112232,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.5/113.5 kB 9.6 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.418073368,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.0/12.0 MB 103.9 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.506303775,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.5/113.5 kB 9.6 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.506331495,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.0/12.0 MB 103.9 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.574433696,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 86.6 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.596169453,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 566.4/566.4 kB 39.5 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.776377537,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 86.6 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":16.77642332,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 566.4/566.4 kB 39.5 MB/s eta 0:00:00\n"} 
+,{"stream_name":"stdout","time":33.162250396,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 31.7 MB/s eta 0:00:00\n"} +,{"stream_name":"stdout","time":33.364164004,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 31.7 MB/s eta 0:00:00\n"} +,{"stream_name":"stderr","time":79.42841178,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} +,{"stream_name":"stderr","time":79.428609104,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} +,{"stream_name":"stderr","time":79.428630478,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} +,{"stream_name":"stderr","time":79.428635058,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} +,{"stream_name":"stderr","time":79.428638808,"data":"W0000 00:00:1772045650.692242 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.428642335,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.428645726,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.428733057,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633527785,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} +,{"stream_name":"stderr","time":79.633570118,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} +,{"stream_name":"stderr","time":79.633578121,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} +,{"stream_name":"stderr","time":79.63358432,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} +,{"stream_name":"stderr","time":79.633589577,"data":"W0000 00:00:1772045650.692242 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.63359445,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633599539,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633616186,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633619156,"data":"\n"} +,{"stream_name":"stderr","time":79.633621351,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} +,{"stream_name":"stderr","time":79.63362412,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} +,{"stream_name":"stderr","time":79.633626418,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} +,{"stream_name":"stderr","time":79.633628938,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} +,{"stream_name":"stderr","time":79.633632435,"data":"W0000 00:00:1772045650.692242 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633634944,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633637372,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":79.633639898,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\n"} +,{"stream_name":"stderr","time":98.077741199,"data":"/bin/sh: 1: sox: not found\n"} +,{"stream_name":"stderr","time":98.084522944,"data":"/bin/sh: 1: sox: not found\n"} +,{"stream_name":"stderr","time":98.084561306,"data":"WARNING:sox:SoX could not be found!\n"} +,{"stream_name":"stderr","time":98.084569906,"data":"\n"} +,{"stream_name":"stderr","time":98.084573978,"data":" If you do not have SoX, proceed here:\n"} +,{"stream_name":"stderr","time":98.084577812,"data":" - - - http://sox.sourceforge.net/ - - -\n"} +,{"stream_name":"stderr","time":98.084581684,"data":"\n"} +,{"stream_name":"stderr","time":98.084585536,"data":" If you do (or think that you should) have SoX, double-check your\n"} +,{"stream_name":"stderr","time":98.084589885,"data":" path variables.\n"} +,{"stream_name":"stderr","time":98.084593742,"data":" \n"} +,{"stream_name":"stderr","time":98.084597023,"data":"\n"} +,{"stream_name":"stderr","time":98.084600338,"data":"/bin/sh: 1: sox: not found\n"} +,{"stream_name":"stderr","time":98.084603859,"data":"WARNING:sox:SoX could not be found!\n"} +,{"stream_name":"stderr","time":98.084607247,"data":"\n"} +,{"stream_name":"stderr","time":98.084610416,"data":" If you do not have SoX, proceed here:\n"} +,{"stream_name":"stderr","time":98.084613818,"data":" - - - http://sox.sourceforge.net/ - - -\n"} +,{"stream_name":"stderr","time":98.084618784,"data":"\n"} +,{"stream_name":"stderr","time":98.084622464,"data":" If you do (or think that you should) have SoX, double-check your\n"} +,{"stream_name":"stderr","time":98.084634034,"data":" path variables.\n"} +,{"stream_name":"stderr","time":98.084636545,"data":" \n"} +,{"stream_name":"stdout","time":99.940193512,"data":"CUDA available: True\n"} +,{"stream_name":"stdout","time":99.94023427,"data":"GPU: Tesla P100-PCIE-16GB\n"} +,{"stream_name":"stdout","time":99.940239859,"data":"Loading model: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice\n"} 
+,{"stream_name":"stdout","time":134.503678322,"data":"Generating speech...\n"} +,{"stream_name":"stderr","time":134.59886615,"data":"Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.\n"} +,{"stream_name":"stderr","time":134.598897308,"data":"\n"} +,{"stream_name":"stderr","time":134.598903838,"data":"Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.\n"} +,{"stream_name":"stdout","time":160.741624344,"data":"Audio saved to /kaggle/working/qwen3_tts.wav\n"} +,{"stream_name":"stdout","time":160.741672884,"data":"Sample rate: 24000 Hz, Duration: 10.00s\n"} +,{"stream_name":"stderr","time":165.862483141,"data":"/usr/local/lib/python3.12/dist-packages/mistune.py:435: SyntaxWarning: invalid escape sequence '\\|'\n"} +,{"stream_name":"stderr","time":165.86251786,"data":" cells[i][c] = re.sub('\\\\\\\\\\|', '|', cell)\n"} +,{"stream_name":"stderr","time":166.010702784,"data":"/usr/local/lib/python3.12/dist-packages/nbconvert/filters/filter_links.py:36: SyntaxWarning: invalid escape sequence '\\_'\n"} +,{"stream_name":"stderr","time":166.010732274,"data":" text = re.sub(r'_', '\\_', text) # Escape underscores in display text\n"} +,{"stream_name":"stderr","time":166.570315686,"data":"/usr/local/lib/python3.12/dist-packages/traitlets/traitlets.py:2915: FutureWarning: --Exporter.preprocessors=[\"remove_papermill_header.RemovePapermillHeader\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... 
multiple times to add items to a list.\n"} +,{"stream_name":"stderr","time":166.570350872,"data":" warn(\n"} +,{"stream_name":"stderr","time":166.596721549,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to notebook\n"} +,{"stream_name":"stderr","time":166.933120221,"data":"[NbConvertApp] Writing 147162 bytes to __notebook__.ipynb\n"} +,{"stream_name":"stderr","time":169.293420137,"data":"/usr/local/lib/python3.12/dist-packages/traitlets/traitlets.py:2915: FutureWarning: --Exporter.preprocessors=[\"nbconvert.preprocessors.ExtractOutputPreprocessor\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... multiple times to add items to a list.\n"} +,{"stream_name":"stderr","time":169.293456667,"data":" warn(\n"} +,{"stream_name":"stderr","time":169.314149193,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to html\n"} +,{"stream_name":"stderr","time":170.145421049,"data":"[NbConvertApp] Writing 396049 bytes to __results__.html\n"} +] diff --git a/README.md b/README.md index e779029..087fd3e 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,6 @@ Simple, user-friendly Text-to-Speech (TTS) library with CLI and Python API. Supports multiple free and commercial TTS providers. -## Installation - -```bash -pip install voicegenhub -# or -poetry add voicegenhub -``` - ### Optional Dependencies - **Microsoft Edge TTS** (free, cloud-based) @@ -248,6 +240,127 @@ poetry install --with qwen - `instruct`: Emotion/style instruction (for CustomVoice) or voice description (for VoiceDesign) - `temperature`, `top_p`, `top_k`, `repetition_penalty`, `max_new_tokens`: Advanced sampling parameters +### Qwen3-TTS on Kaggle P100 GPU + +Run the full Qwen3-TTS pipeline on a **free Kaggle P100 GPU**. VoiceGenHub automatically pushes a notebook to Kaggle, runs it with GPU acceleration, polls for completion, and downloads the audio to a local timestamped folder — no Kaggle web UI interaction required. + +#### Prerequisites + +1. 
**Install the Kaggle CLI:** + ```bash + pip install kaggle + ``` + +2. **Set up Kaggle API credentials** (`~/.kaggle/kaggle.json`): + - Go to https://www.kaggle.com/settings → API → Create New Token + - Save the downloaded `kaggle.json` to `~/.kaggle/kaggle.json` + - On Windows: `%USERPROFILE%\.kaggle\kaggle.json` + +3. **Enable internet on Kaggle notebooks** (required for `pip install`): + - Kaggle by default allows internet access from notebooks (no action needed). + +#### Usage + + +```bash + + + + +# Basic usage — outputs to a timestamped folder (YYYYMMDD_HHMMSS) +poetry run voicegenhub synthesize "Hello from the Kaggle GPU!" --provider qwen --gpu p100 + + + + + +# To use dual T4 GPUs, pass --gpu t4. To force CPU, pass --cpu (or omit both flags for default CPU mode). + + + +# Specify voice and language +poetry run voicegenhub synthesize "This is a test." --provider qwen --voice Ryan --language en --gpu p100 + + + +# Chinese with native speaker +poetry run voicegenhub synthesize "你好,这是一个测试。" --provider qwen --voice Serena --language zh --gpu p100 + + + +# Explicit output directory and filename +poetry run voicegenhub synthesize "Big model test." \ + --provider qwen \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --output-dir 20260225_153045 \ + --output-filename my_audio.wav \ + --gpu p100 + + + +# Adjust polling timeout (default 60 min) +poetry run voicegenhub synthesize "Long text..." --provider qwen --gpu p100 --timeout 90 --poll-interval 30 +# The GPU type is selected by the value passed to --gpu: p100 or t4 (dual T4). +``` + + +#### All `synthesize` flags for Kaggle GPU + +| Flag | Default | Description | +|------|---------|-------------| +| `TEXT` | *(required)* | Text to synthesize | +| `--provider` | *(required)* | TTS provider: `qwen`, `chatterbox`, etc. | +| `--voice`, `-v` | `Ryan` | Speaker name: `Ryan`, `Serena`, etc. | +| `--language`, `-l` | `en` | Language code: `en`, `zh`, `fr`, etc. 
| +| `--model`, `-m` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | HuggingFace model ID | +| `--output-dir`, `-o` | `YYYYMMDD_HHMMSS` (current datetime) | Local folder for the downloaded audio | +| `--output-filename` | `qwen3_tts.wav` | Filename for the generated audio | +| `--gpu [p100|t4]` | *(optional)* | Run remotely on Kaggle GPU (specify `p100` or `t4`) | +| `--cpu` | *(optional)* | Force CPU mode (default if neither flag is set) | +| `--timeout` | `60` | Timeout in minutes to wait for the kernel | +| `--poll-interval` | `60` | Status polling interval in seconds | + +#### How it works + +1. **Build** — VoiceGenHub generates a Jupyter notebook with your text, voice, and model parameters. +2. **Push** — The notebook is pushed to Kaggle with `enable_gpu: true` (P100). +3. **Run** — Kaggle executes the notebook: installs `qwen-tts`, loads the model on the GPU, generates audio. +4. **Poll** — VoiceGenHub polls `kaggle kernels status` every 60 seconds until completion. +5. **Download** — The `.wav` file is fetched with `kaggle kernels output` and placed in your local output directory. + + + + +**Note:** If you do not specify `--gpu` or `--cpu`, VoiceGenHub will run on CPU by default. For Qwen3-TTS and Chatterbox, running on CPU will print a **BIG VISIBLE WARNING** and may be extremely slow or fail. Use `--gpu p100` or `--gpu t4` for remote GPU. Use `--cpu` to force CPU mode explicitly. + +**The output directory defaults to the current datetime** (e.g. `20260225_153045/qwen3_tts.wav`). + +--- + +## ⚠️ IMPORTANT: GPU Requirement for Qwen3/Chatterbox + +**Qwen3-TTS and Chatterbox require a GPU for practical generation speed.** + +- If you run these providers **without** `--gpu` (or on a CPU-only machine), you will see a **BIG WARNING** and generation will be extremely slow or may fail. +- Always use `--gpu` for Qwen3 and Chatterbox unless you are on a local machine with a powerful GPU. 
+ +**Example warning:** + +``` +WARNING: Qwen3-TTS and Chatterbox require a GPU for fast generation. Use --gpu (and optionally --gpu-type) to run on Kaggle or your local GPU. CPU-only runs are not recommended and may fail. +``` + +#### Available Qwen3-TTS Models on Kaggle GPU + +| Model | Size | Speed | Best For | +|-------|------|-------|----------| +| `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | 600M | Fast | Quick iterations | +| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | 1.7B | Normal | **Best quality** ✅ | + +> **Tip:** The 1.7B model is recommended for production quality. A P100 has 16 GB VRAM — more than enough for the 1.7B model at float16. + +--- + ### Bark ```bash diff --git a/docs/benchmarks_and_performance.md b/docs/benchmarks_and_performance.md new file mode 100644 index 0000000..f99a724 --- /dev/null +++ b/docs/benchmarks_and_performance.md @@ -0,0 +1,27 @@ +# Performance and Benchmarks + +VoiceGenHub is designed for both local CPU-only systems and GPU-accelerated environments. + +## Performance Comparison (Single Job) + +| Provider | Quality (MOS) | Startup Time | Sequential (per req) | Async (3x parallel) | Model Size | Commercial | +|----------|---------------|--------------|---------------------|-------------------|------------|------------| +| **Edge TTS** | 3.8/5 | 4.9s | 3.2s | 2.5s | 0MB (cloud) | ✅ Free | +| **Kokoro** | 3.5/5 | 94s | 14.2s | 2.5s | 625MB | ✅ Apache 2.0 | +| **Bark** | 4.2/5 | 180s | 25-40s | 8-12s | 4GB | ✅ MIT | +| **Chatterbox** | 4.3/5 | 120s | 15-30s | 5-15s | 3.7GB | ✅ MIT | +| **ElevenLabs** | 4.5/5* | 2s | 3-5s | 2-3s | 0MB (cloud) | ⚠️ Paid API | + +*ElevenLabs quality estimate based on reputation; not yet tested.* + +## Concurrency Analysis (Chatterbox) + +- **Memory Safety**: Chatterbox uses a **shared model instance** (3.6GB) across all threads — **no duplication**. +- **Performance**: ~2.8x speedup at 4 threads on CPU. Optimal thread count: **2-4 threads**. 
+- **Async Concurrency**: Safe to use 2-8 concurrent threads without OOM risk. + +## [View Concurrency Plot](assets/concurrency_plot.html) +Interactive performance analysis showing speedup curves, memory usage, and timing breakdowns. + +--- +*For more details on Kaggle GPU benchmarks, see the remote GPU documentation.* diff --git a/docs/cloning_and_design.md b/docs/cloning_and_design.md new file mode 100644 index 0000000..cf1e02e --- /dev/null +++ b/docs/cloning_and_design.md @@ -0,0 +1,51 @@ +# Voice Cloning and Design + +VoiceGenHub supports both zero-shot voice cloning (from audio samples) and voice design (from textual descriptions). + +## 1. Voice Cloning with [Chatterbox](https://github.com/rsxdalv/chatterbox) + +### Steps + +1. **Generate a Reference Audio** (or use an existing sample): + ```bash + voicegenhub synthesize "Sample text for cloning." \ + --provider kokoro \ + --voice kokoro-am_michael \ + --output reference.wav + ``` + +2. **Clone the Voice**: + ```bash + voicegenhub synthesize "Your text to be synthesized in the cloned voice." \ + --provider chatterbox \ + --audio-prompt reference.wav \ + --output cloned_voice.wav + ``` + +3. **Adjust Emotion and Style**: + ```bash + voicegenhub synthesize "Your text." \ + --provider chatterbox \ + --audio-prompt reference.wav \ + --exaggeration 0.8 \ + --cfg-weight 0.7 + ``` + +### Tips for Better Quality +- Use clear, noise-free reference audio (5-10 seconds recommended). +- Chatterbox supports **multilingual cloning** (clone any language, synthesize in any other language). + +## 2. Voice Design with [Qwen 3 TTS](https://github.com/QwenLM/Qwen3-TTS) + +*Requires `Qwen3-TTS-VoiceDesign` model for full control, available via Python API or remote GPU.* + +### Qwen 3 TTS Voice Design Features + +- **Natural Language Instruction**: Design custom voices using descriptions. 
+- **Example Voice Design**: + - `"Female, 25 years old, cheerful and energetic, slightly high-pitched with playful intonation"` + - `"Male, 17 years old, gaining confidence, deeper breath support, vowels tighten when nervous"` + - `"Elderly male, 70 years old, wise and gentle, slightly raspy with warm timbre"` + +--- +*For more details on Qwen 3 TTS design modes, see the [Qwen 3 TTS documentation](https://github.com/QwenLM/Qwen3-TTS).* diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..a7941d1 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,66 @@ +# Installation and Requirements + +Detailed installation guide for various TTS providers and optional features. + +## Basic Installation + +```bash +pip install voicegenhub +``` + +## Optional Provider Dependencies + +To use certain providers, you need to install their respective dependencies: + +```bash +# Kokoro TTS (Lightweight, self-hosted) +pip install voicegenhub[kokoro] + +# Bark TTS (High Quality, MIT) +pip install voicegenhub[bark] + +# Chatterbox TTS (High Quality, MIT) +pip install chatterbox-tts + +# Qwen 3 TTS (State-of-the-Art, Apache 2.0) +pip install voicegenhub[qwen] + +# ElevenLabs TTS (Commercial) +pip install elevenlabs +``` + +--- + +## 2. Dependencies + +### Voice Cloning Requirements (Chatterbox) + +For voice cloning features with Chatterbox TTS: + +```bash +pip install voicegenhub[voice-cloning] +``` + +**System Requirements:** +- **FFmpeg**: Required when `torchcodec` is installed for voice cloning. +- **PyTorch**: Required for local model execution. + +**Windows Installations**: Download the "full-shared" FFmpeg build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and add the `bin` directory to your system PATH. + +--- + +## Technical Note: CUDA and CPU Execution + +- VoiceGenHub automatically detects if a GPU is available. +- For **Chatterbox** and **Bark**, if no GPU is found, the library will fall back to **CPU execution**. 
+- For **Qwen 3 TTS**, **GPU acceleration** (remote or local) is recommended, especially for the high-quality 1.7B models. + +--- + +## Windows & Python 3.13+ (Kokoro) + +On Windows with Python 3.13+, **Kokoro TTS** may require Microsoft Visual C++ Build Tools for compilation if pre-built wheels are not available. + +1. Download [Microsoft Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/). +2. Select "Desktop development with C++" workload. +3. Restart terminal and retry installation. diff --git a/docs/kaggle_gpu.md b/docs/kaggle_gpu.md new file mode 100644 index 0000000..de4bcd9 --- /dev/null +++ b/docs/kaggle_gpu.md @@ -0,0 +1,52 @@ +# Kaggle Remote GPU Generation + +Generate high-quality Qwen3-TTS audio using remote Kaggle GPUs (P100 or T4x2). This is useful for high-quality 1.7B models when you don't have a local GPU. + +## Prerequisites + +1. **Kaggle API Credentials**: + - Go to [Kaggle Settings](https://www.kaggle.com/settings) → API → Create New Token. + - Save the `kaggle.json` to `~/.kaggle/kaggle.json` (on Windows: `%USERPROFILE%\.kaggle\kaggle.json`). +2. **Kaggle CLI**: + ```bash + pip install kaggle + ``` +3. **Kaggle Internet Access**: + - Ensure your Kaggle account has phone verification completed (allows internet access in kernels). + +## Usage + +Pass the `--gpu` option with a GPU type (`p100` or `t4`) to the `synthesize` command to trigger remote generation. + +### P100 GPU + +```bash +voicegenhub synthesize "Hello from the remote P100!" --gpu p100 +``` + +### T4 x 2 GPU + +```bash +voicegenhub synthesize "Hello from the remote T4!" --gpu t4 +``` + +### Advanced Usage + +```bash +voicegenhub synthesize "Chinese test." \ + --provider qwen \ + --gpu p100 \ + --voice Serena \ + --language zh \ + --output ./remote_output/serena.wav +``` + +## How It Works + +1. **Automation**: VoiceGenHub generates a Jupyter notebook cell-by-cell. +2. **Deployment**: It pushes the notebook to Kaggle using the specified accelerator (`nvidia-p100-1` or `nvidia-t4-2`). 
+3. **Execution**: On Kaggle, the notebook installs necessary dependencies (`transformers`, `qwen-tts`), loads the model onto the GPU, and generates the audio. +4. **Syncing**: The CLI polls for completion and automatically downloads the generated `.wav` file into a local timestamped directory (or your specified output path). + +--- +*Note: Remote generation takes approximately 2-4 minutes due to environment setup on Kaggle's side.* diff --git a/docs/licensing.md b/docs/licensing.md new file mode 100644 index 0000000..ef0e9db --- /dev/null +++ b/docs/licensing.md @@ -0,0 +1,19 @@ +# Licensing and Commercial Usage + +VoiceGenHub is compatible with multiple free and commercial TTS licenses. + +## Commercially Safe Models (summary) +- **Bark** (MIT License) - Commercial use allowed; the copyright and license notice must be retained in copies. +- **Chatterbox** (MIT License) - Commercial use allowed; the copyright and license notice must be retained in copies. +- **Qwen 3 TTS** (Apache 2.0) - Commercial use allowed, attribution required. +- **Kokoro** (Apache 2.0) - Commercial use allowed, attribution required. +- **Edge TTS** (Microsoft) - Commercial use allowed. +- **ElevenLabs** (Paid API) - Commercial use with valid subscription. 
+ +### Provider Licenses (links) +- **Edge TTS (Microsoft)**: [Microsoft Terms of Use](https://www.microsoft.com/en-us/legal/terms-of-use) +- **Kokoro TTS**: [Apache License 2.0](https://github.com/hexgrad/kokoro/blob/main/LICENSE) +- **ElevenLabs TTS**: [ElevenLabs Terms of Service](https://elevenlabs.io/terms) +- **Bark TTS**: [MIT License](https://github.com/suno-ai/bark/blob/main/LICENSE) +- **Chatterbox TTS**: [MIT License](https://github.com/rsxdalv/chatterbox/blob/main/LICENSE) +- **Qwen 3 TTS**: [Apache License 2.0](https://github.com/QwenLM/Qwen3-TTS/blob/main/LICENSE) diff --git a/docs/providers.md b/docs/providers.md new file mode 100644 index 0000000..9d15ca9 --- /dev/null +++ b/docs/providers.md @@ -0,0 +1,51 @@ +# TTS Providers Detail + +VoiceGenHub supports multiple free and commercial TTS providers. + +## [Chatterbox TTS](https://github.com/rsxdalv/chatterbox) (MIT) +Multilingual TTS with emotion control and voice cloning. + +### Features +- **Model selection via voice**: Choose between standard, turbo, or multilingual models. +- Emotion/intensity control with `exaggeration` parameter (0.0-1.0). +- Zero-shot voice cloning from audio samples. +- Built-in Perth watermarking for responsible AI. + +### Supported Languages +ar, da, de, el, en, es, fi, fr, he, hi, it, ja, ko, ms, nl, no, pl, pt, ru, sv, sw, tr, zh + +--- + +## [Qwen 3 TTS](https://github.com/QwenLM/Qwen3-TTS) (Apache 2.0) +State-of-the-art multilingual TTS with voice design and cloning. + +### Features +- **Three generation modes**: CustomVoice, VoiceDesign, VoiceClone. +- **10 languages**: Chinese, English, French, German, Italian, Japanese, Korean, Portuguese, Russian, Spanish. +- **Native speakers**: Automatic selection of native speakers per language. +- **Ultra-low latency**: Streaming generation supported. + +--- + +## [Bark TTS](https://github.com/suno-ai/bark) (MIT) +Self-hosted high-naturalness TTS with prosody control. 
+ +### Features +- Prosody markers: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]`. +- 100+ speaker presets. +- Sound effects generation. + +--- + +## [Kokoro TTS](https://github.com/hexgrad/kokoro) (Apache 2.0) +Self-hosted, extremely lightweight and fast. + +--- + +## [Microsoft Edge TTS](https://github.com/rany2/edge-tts) (Free Cloud) +Fast, high-quality cloud-based voices. + +--- + +## [ElevenLabs TTS](https://elevenlabs.io) (Commercial) +Premium high-quality voices (requires API key). diff --git a/pyproject.toml b/pyproject.toml index 12e019d..5feb3b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "voicegenhub" -version = "1.1.5" +version = "2.0.0" description = "Simple Text-to-Speech library supporting multiple providers" authors = ["leweex95 "] readme = "README.md" diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index 323f89e..e5368c9 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -8,6 +8,7 @@ import sys import tempfile import threading +from datetime import datetime from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Optional @@ -313,6 +314,16 @@ def cli(): "--pitch", type=float, default=1.0, help="Speech pitch (0.5-2.0, default 1.0)" ) @click.option("--provider", "-p", help="TTS provider") +@click.option( + "--gpu", + type=click.Choice(["p100", "t4"]), + help="Use remote Kaggle GPU for generation (currently Qwen3-TTS only)", +) +@click.option( + "--cpu", + is_flag=True, + help="Use local CPU for generation (default)", +) @click.option( "--lowpass", type=int, @@ -387,11 +398,64 @@ def cli(): ) def synthesize( texts, voice, language, output, format, rate, pitch, provider, - lowpass, normalize, distortion, noise, reverb, pitch_shift, + gpu, cpu, lowpass, normalize, distortion, noise, reverb, pitch_shift, exaggeration, cfg_weight, audio_prompt, turbo, multilingual, instruct, ref_audio, ref_text ): - """Generate speech from text(s).""" 
+ """Generate speech from text(s). Use --gpu [p100|t4] for remote Kaggle GPU acceleration.""" + # Redirect to Kaggle pipeline if --gpu is specified + if gpu: + if len(texts) > 1: + click.echo("Error: --gpu currently supports only single text generation", err=True) + sys.exit(1) + + from .kaggle.pipeline import KaggleQwenPipeline + pipeline = KaggleQwenPipeline() + + # Determine output directory and filename using the requested format + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + suffix = f"_{gpu}" + output_dir_name = f"{timestamp}{suffix}" + + if output: + output_path = Path(output) + # If user provided a path, we use it as base but still follow the folder naming convention if it's a dir + if not output_path.suffix: + output_dir = str(output_path / output_dir_name) + output_filename = "qwen3_tts.wav" + else: + output_dir = str(output_path.parent / output_dir_name) + output_filename = output_path.name + else: + output_dir = output_dir_name + output_filename = "qwen3_tts.wav" + + try: + result_path = pipeline.run( + text=texts[0], + voice=voice or "Ryan", + language=language or "en", + output_dir=output_dir, + output_filename=output_filename, + gpu_type=gpu, # Now using the gpu value directly as p100 or t4 + ) + click.echo(f"SUCCESS: Remote audio available at: {result_path.absolute()}") + return + except Exception as e: + click.echo(f"Error during remote generation: {e}", err=True) + sys.exit(1) + + # For local CPU runs, ensure directory structure matches requested format + if not output: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = f"{timestamp}_cpu" + os.makedirs(output_dir, exist_ok=True) + # For single text, we still want to respect the output_dir + if len(texts) == 1: + output = os.path.join(output_dir, "output.wav") + else: + output = os.path.join(output_dir, "batch") + # Validate provider immediately supported_providers = [ "edge", "kokoro", "elevenlabs", "bark", "chatterbox", "qwen" diff --git 
a/src/voicegenhub/kaggle/__init__.py b/src/voicegenhub/kaggle/__init__.py new file mode 100644 index 0000000..7554829 --- /dev/null +++ b/src/voicegenhub/kaggle/__init__.py @@ -0,0 +1,5 @@ +"""Kaggle GPU pipeline for remote TTS generation.""" + +from .pipeline import KaggleQwenPipeline + +__all__ = ["KaggleQwenPipeline"] diff --git a/src/voicegenhub/kaggle/pipeline.py b/src/voicegenhub/kaggle/pipeline.py new file mode 100644 index 0000000..98827d2 --- /dev/null +++ b/src/voicegenhub/kaggle/pipeline.py @@ -0,0 +1,482 @@ +""" +Kaggle GPU Pipeline for Qwen3-TTS Remote Generation. + +Pushes a notebook to Kaggle, runs it on a free P100 GPU, +polls for completion, and downloads the generated audio automatically. +""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +import zipfile +from pathlib import Path +from typing import Optional + +from ..utils.logger import get_logger + +logger = get_logger(__name__) + +_DEFAULT_SETTINGS_PATH = Path(__file__).parent / "config" / "kaggle_settings.json" +_KERNEL_SLUG = "voicegenhub-qwen3-tts" + + +def _load_settings() -> dict: + """Load Kaggle pipeline settings from config JSON.""" + try: + with open(_DEFAULT_SETTINGS_PATH) as f: + return json.load(f) + except FileNotFoundError: + return { + "deployment_timeout_minutes": 60, + "polling_interval_seconds": 60, + "retry_interval_seconds": 60, + } + + +def _detect_kaggle_username() -> str: + """Detect Kaggle username from credentials or env.""" + # 1. Environment variable + if os.environ.get("KAGGLE_USERNAME"): + return os.environ["KAGGLE_USERNAME"] + + # 2. ~/.kaggle/kaggle.json + kaggle_json = Path.home() / ".kaggle" / "kaggle.json" + if kaggle_json.exists(): + try: + with open(kaggle_json) as f: + creds = json.load(f) + return creds.get("username", "") + except Exception: + pass + + raise RuntimeError( + "Kaggle username not found. 
Set KAGGLE_USERNAME env var " + "or ensure ~/.kaggle/kaggle.json exists with 'username' field." + ) + + +def _build_notebook_source( + text: str, + voice: str, + language: str, + model_id: str, + dtype: str, + output_filename: str, +) -> dict: + """Build the Jupyter notebook content for Kaggle GPU execution.""" + + # Language mapping (CLI code → Qwen language string) + language_map = { + "en": "English", + "zh": "Chinese", + "fr": "French", + "de": "German", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "pt": "Portuguese", + "ru": "Russian", + "es": "Spanish", + } + qwen_language = language_map.get(language.lower(), "English") + + install_code = textwrap.dedent("""\ + import subprocess, sys + + def pip_install(*packages): + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages]) + + pip_install("transformers>=4.40.0", "accelerate>=0.27.0", "tokenizers") + pip_install("qwen-tts") + try: + pip_install("flash-attn", "--no-cache-dir") + except Exception as e: + print(f"flash-attn install skipped (non-fatal): {e}") + pip_install("soundfile") + """) + + gen_code = textwrap.dedent(f"""\ + import torch + import soundfile as sf + from qwen_tts import Qwen3TTSModel + + MODEL_ID = {json.dumps(model_id)} + OUTPUT_PATH = "/kaggle/working/{output_filename}" + + print(f"CUDA available: {{torch.cuda.is_available()}}") + if torch.cuda.is_available(): + print(f"GPU: {{torch.cuda.get_device_name(0)}}") + + print(f"Loading model: {{MODEL_ID}}") + model = Qwen3TTSModel.from_pretrained( + MODEL_ID, + device_map="cuda:0" if torch.cuda.is_available() else "cpu", + dtype=torch.float16, + ) + + print("Generating speech...") + wavs, sr = model.generate_custom_voice( + text={json.dumps(text)}, + language={json.dumps(qwen_language)}, + speaker={json.dumps(voice)}, + ) + + sf.write(OUTPUT_PATH, wavs[0], sr) + print(f"Audio saved to {{OUTPUT_PATH}}") + print(f"Sample rate: {{sr}} Hz, Duration: {{len(wavs[0])/sr:.2f}}s") + """) + + notebook = { + "nbformat": 
4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3", + }, + "language_info": {"name": "python", "version": "3.10.0"}, + }, + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# VoiceGenHub — Qwen3-TTS GPU Generation\n", + f"**Model:** `{model_id}` \n", + f"**Text:** {text[:120]}{'...' if len(text) > 120 else ''} \n", + f"**Voice:** {voice} **Language:** {qwen_language}\n", + ], + }, + { + "cell_type": "code", + "execution_count": None, + "id": "install", + "metadata": {}, + "outputs": [], + "source": install_code.splitlines(keepends=True), + }, + { + "cell_type": "code", + "execution_count": None, + "id": "generate", + "metadata": {}, + "outputs": [], + "source": gen_code.splitlines(keepends=True), + }, + ], + } + return notebook + + +def _build_kernel_metadata( + username: str, kernel_slug: str, notebook_filename: str, gpu_type: str = "p100" +) -> dict: + """Build Kaggle kernel-metadata.json.""" + return { + "id": f"{username}/{kernel_slug}", + "title": f"VoiceGenHub Remote GPU ({gpu_type.upper()})", + "code_file": notebook_filename, + "language": "python", + "kernel_type": "notebook", + "is_private": True, + "enable_gpu": True, + "enable_tpu": False, + "enable_internet": True, + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": [], + "model_sources": [], + } + + +def _resolve_kaggle_executable() -> str: + """ + Resolve the 'kaggle' CLI executable. + + Priority: + 1. Same directory as the current Python executable (venv Scripts/) + 2. System PATH + """ + python_dir = Path(sys.executable).parent + for candidate in ("kaggle.exe", "kaggle"): + path = python_dir / candidate + if path.exists(): + return str(path) + + # Fall back to PATH + found = shutil.which("kaggle") + if found: + return found + + raise FileNotFoundError( + "Could not find the 'kaggle' CLI executable. 
" + "Install it with: pip install kaggle" + ) + + +def _extract_kernel_id_from_push(push_stdout: str, fallback: str) -> str: + """ + Extract the actual kernel ID from the push output. + + The push command prints something like: + "Kernel version 1 successfully pushed. Please check progress at + https://www.kaggle.com/code/leventecsibi/my-kernel-slug" + + We parse the URL path to get the actual slug Kaggle used. + """ + import re + match = re.search(r"kaggle\.com/code/([^/\s]+/[^/\s]+)", push_stdout) + if match: + return match.group(1) + return fallback + + +def _run_cmd(args, capture=True, check=True): + """Run a shell command. Resolves 'kaggle' to the correct venv executable.""" + resolved = list(args) + if resolved and resolved[0] == "kaggle": + resolved[0] = _resolve_kaggle_executable() + logger.debug(f"Running: {' '.join(str(a) for a in resolved)}") + result = subprocess.run( + resolved, + capture_output=capture, + text=True, + check=check, + ) + return result + + +class KaggleQwenPipeline: + """ + End-to-end pipeline: generate Qwen3-TTS audio on Kaggle P100 GPU + and download the result locally. + + Workflow: + 1. Build a Jupyter notebook with the user's text/voice/model parameters. + 2. Push it to Kaggle with GPU enabled (P100). + 3. Poll until the kernel finishes. + 4. Download the output `.wav` file. + 5. Place it into a timestamped output directory. 
+ """ + + def __init__( + self, + model_id: str = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + dtype: str = "float16", + kernel_slug: str = _KERNEL_SLUG, + settings_path: Optional[Path] = None, + ): + self.model_id = model_id + self.dtype = dtype + self.kernel_slug = kernel_slug + self._settings = _load_settings() if settings_path is None else json.loads(Path(settings_path).read_text()) + self._timeout_minutes = self._settings.get("deployment_timeout_minutes", 60) + self._poll_interval = self._settings.get("polling_interval_seconds", 60) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def run( + self, + text: str, + voice: str = "Ryan", + language: str = "en", + output_dir: Optional[str] = None, + output_filename: str = "qwen3_tts.wav", + gpu_type: str = "p100", # "p100" or "t4" + ) -> Path: + """ + Run the full Kaggle Qwen3-TTS pipeline. + + Args: + text: Text to synthesize. + voice: Speaker name (e.g. "Ryan", "Serena"). + language: ISO language code (e.g. "en", "zh"). + output_dir: Local directory for the downloaded audio file. + Defaults to a timestamped folder in the cwd. + output_filename: Filename for the generated audio on Kaggle. + gpu_type: Kaggle accelerator type ("p100", "t4"). + + Returns: + Path to the downloaded audio file. + """ + username = _detect_kaggle_username() + kernel_id = f"{username}/{self.kernel_slug}" + + if output_dir is None: + from datetime import datetime + output_dir = datetime.now().strftime("%Y%m%d") + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + logger.info( + "Starting Kaggle Qwen3-TTS pipeline", + kernel_id=kernel_id, + model=self.model_id, + voice=voice, + language=language, + gpu_type=gpu_type, + ) + + # 1. 
Build notebook + push + with tempfile.TemporaryDirectory() as tmpdir: + notebook_filename = "qwen3_tts.ipynb" + notebook_path = Path(tmpdir) / notebook_filename + metadata_path = Path(tmpdir) / "kernel-metadata.json" + + notebook = _build_notebook_source( + text=text, + voice=voice, + language=language, + model_id=self.model_id, + dtype=self.dtype, + output_filename=output_filename, + ) + notebook_path.write_text(json.dumps(notebook, indent=2)) + + metadata = _build_kernel_metadata( + username, self.kernel_slug, notebook_filename, gpu_type=gpu_type + ) + metadata_path.write_text(json.dumps(metadata, indent=2)) + + logger.info(f"Pushing kernel to Kaggle: {kernel_id} (accelerator: {gpu_type})") + try: + # Accelerator flag to ensure correct resource allocation + acc_flag = "nvidia-p100-1" if gpu_type == "p100" else "nvidia-t4-2" + push_result = _run_cmd( + ["kaggle", "kernels", "push", "-p", tmpdir, "--accelerator", acc_flag], + capture=True, + check=True, + ) + push_out = push_result.stdout.strip() + logger.info(f"Push result: {push_out}") + + # Kaggle may create the kernel under a different slug than the + # metadata 'id' field (it slugifies the 'title' instead when + # they differ). Parse the actual URL from the push output. + actual_kernel_id = _extract_kernel_id_from_push(push_out, kernel_id) + if actual_kernel_id != kernel_id: + logger.info( + f"Kaggle resolved kernel slug: {actual_kernel_id} " + f"(metadata had: {kernel_id})" + ) + kernel_id = actual_kernel_id + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"kaggle kernels push failed (exit {exc.returncode}).\n" + f"stdout: {exc.stdout.strip()}\n" + f"stderr: {exc.stderr.strip()}" + ) from exc + + # 2. Poll until done + self._poll_until_complete(kernel_id) + + # 3. 
Download output + local_wav = self._download_output(kernel_id, output_path, output_filename) + + logger.info( + "Kaggle Qwen3-TTS pipeline complete", + output=str(local_wav), + ) + return local_wav + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _poll_until_complete(self, kernel_id: str) -> None: + """Poll Kaggle kernel status until it completes or times out.""" + timeout_seconds = self._timeout_minutes * 60 + elapsed = 0 + + logger.info( + f"Polling kernel status (timeout: {self._timeout_minutes}m, " + f"interval: {self._poll_interval}s)…", + kernel_id=kernel_id, + ) + + while elapsed < timeout_seconds: + try: + result = _run_cmd( + ["kaggle", "kernels", "status", kernel_id], + capture=True, + check=True, + ) + status_line = result.stdout.strip() + logger.info(f"Kernel status: {status_line}") + + status_lower = status_line.lower() + if "complete" in status_lower: + logger.info("Kernel finished successfully.") + return + elif "error" in status_lower or "cancel" in status_lower: + raise RuntimeError( + f"Kaggle kernel ended with non-successful status: {status_line}\n" + "Check the kernel logs at https://www.kaggle.com/code" + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Status check failed: {e.stderr.strip()}, retrying…") + + time.sleep(self._poll_interval) + elapsed += self._poll_interval + + raise TimeoutError( + f"Kaggle kernel did not complete within {self._timeout_minutes} minutes. 
" + f"Check manually: https://www.kaggle.com/code/{kernel_id}" + ) + + def _download_output( + self, + kernel_id: str, + output_path: Path, + output_filename: str, + ) -> Path: + """Download kernel output and extract the audio file.""" + with tempfile.TemporaryDirectory() as dl_dir: + logger.info(f"Downloading kernel outputs from {kernel_id}…") + _run_cmd( + ["kaggle", "kernels", "output", kernel_id, "-p", dl_dir], + capture=True, + check=True, + ) + + # Kaggle downloads a zip file; find and extract it + dl_path = Path(dl_dir) + wav_files = list(dl_path.rglob("*.wav")) + zip_files = list(dl_path.rglob("*.zip")) + + # Extract zips first + for zf in zip_files: + logger.info(f"Extracting {zf.name}…") + with zipfile.ZipFile(zf, "r") as z: + z.extractall(dl_path) + wav_files = list(dl_path.rglob("*.wav")) + + if not wav_files: + # List what was downloaded for debugging + all_files = list(dl_path.rglob("*")) + file_list = ", ".join(f.name for f in all_files if f.is_file()) + raise FileNotFoundError( + f"No .wav file found in kernel output. 
Downloaded files: {file_list}\n" + f"Check kernel logs: https://www.kaggle.com/code/{kernel_id}" + ) + + # Find the right wav (matching output_filename if possible) + target_wav = next( + (f for f in wav_files if f.name == output_filename), + wav_files[0], + ) + + dest = output_path / output_filename + shutil.copy2(target_wav, dest) + logger.info(f"Audio saved locally: {dest}") + return dest From 56a141998563cbbce5c7f475fe4e5c5566db2266 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 08:26:32 +0100 Subject: [PATCH 06/13] rebase --- src/voicegenhub/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index 09c55bb..a9b6692 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -34,6 +34,14 @@ # Additional safety for transformers os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' +# Apply CPU compatibility patches early to prevent import-time crashes in dependencies +try: + from .utils.compatibility import apply_cpu_compatibility_patches + apply_cpu_compatibility_patches() +except Exception: + # Fail silently to avoid breaking the whole library if the tool itself has issues + pass + # Import core classes when available try: from .config.settings import Settings # noqa: F401 From 98e3ba263e5507e12d90eca0f117a6cee664e083 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 08:27:48 +0100 Subject: [PATCH 07/13] rebase --- src/voicegenhub/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index a9b6692..d16f60c 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -22,12 +22,12 @@ import os -# Apply CPU compatibility patches early to prevent import-time crashes in dependencies +# Apply compatibility patches as early as possible (before providers or heavy imports) try: from .utils.compatibility import 
apply_cpu_compatibility_patches apply_cpu_compatibility_patches() -except Exception: - # Fail silently to avoid breaking the whole library if the tool itself has issues +except (ImportError, Exception): + # Fallback to prevent cyclic or import-time issues os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' os.environ['CUDA_VISIBLE_DEVICES'] = '' From 1b4d93e4724c918e3511b73a520bbe0fb098911d Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 08:29:35 +0100 Subject: [PATCH 08/13] rebase --- src/voicegenhub/__init__.py | 14 +++----------- src/voicegenhub/utils/compatibility.py | 7 ++++++- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/voicegenhub/__init__.py b/src/voicegenhub/__init__.py index d16f60c..09c55bb 100644 --- a/src/voicegenhub/__init__.py +++ b/src/voicegenhub/__init__.py @@ -22,26 +22,18 @@ import os -# Apply compatibility patches as early as possible (before providers or heavy imports) +# Apply CPU compatibility patches early to prevent import-time crashes in dependencies try: from .utils.compatibility import apply_cpu_compatibility_patches apply_cpu_compatibility_patches() -except (ImportError, Exception): - # Fallback to prevent cyclic or import-time issues +except Exception: + # Fail silently to avoid breaking the whole library if the tool itself has issues os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' os.environ['CUDA_VISIBLE_DEVICES'] = '' # Additional safety for transformers os.environ['TRANSFORMERS_ATTENTION_IMPLEMENTATION'] = 'eager' -# Apply CPU compatibility patches early to prevent import-time crashes in dependencies -try: - from .utils.compatibility import apply_cpu_compatibility_patches - apply_cpu_compatibility_patches() -except Exception: - # Fail silently to avoid breaking the whole library if the tool itself has issues - pass - # Import core classes when available try: from .config.settings import Settings # noqa: F401 diff --git a/src/voicegenhub/utils/compatibility.py 
b/src/voicegenhub/utils/compatibility.py index 11048eb..30923a4 100644 --- a/src/voicegenhub/utils/compatibility.py +++ b/src/voicegenhub/utils/compatibility.py @@ -7,6 +7,7 @@ logger = get_logger(__name__) + def apply_cpu_compatibility_patches(): """Apply patches to ensure stability on CPU-only environments.""" @@ -29,7 +30,9 @@ def apply_cpu_compatibility_patches(): mock_codec.__version__ = "0.9.1" mock_codec.__spec__ = ModuleSpec("torchcodec", None) - class Frame: pass + class Frame: + pass + class Decoder: def __init__(self, *args, **kwargs): pass @@ -62,6 +65,7 @@ def patched_version(package_name): if "torch" in sys.modules: _patch_torch_cuda(sys.modules["torch"]) + def _patch_torch_cuda(torch): """Specific patches for torch when it's already loaded.""" if not torch.cuda.is_available(): @@ -78,6 +82,7 @@ def _patch_torch_cuda(torch): except Exception: pass + def ensure_torchcodec(): """Specific check for torchcodec to satisfy Transformers >= 4.51.""" try: From f82aa27378aa43ea808b3c1acc2290616363969a Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 10:43:47 +0100 Subject: [PATCH 09/13] fixed erroneous cli exec --- README.md | 25 ++--------- src/voicegenhub/cli.py | 71 +++++++++++++++++++++++------- src/voicegenhub/kaggle/pipeline.py | 19 ++++++-- 3 files changed, 75 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 087fd3e..4dbb8b0 100644 --- a/README.md +++ b/README.md @@ -263,31 +263,17 @@ Run the full Qwen3-TTS pipeline on a **free Kaggle P100 GPU**. VoiceGenHub autom ```bash - - - - -# Basic usage — outputs to a timestamped folder (YYYYMMDD_HHMMSS) +# Basic usage — outputs to a timestamped folder (YYYYMMDD_HHMMSS_p100) poetry run voicegenhub synthesize "Hello from the Kaggle GPU!" --provider qwen --gpu p100 - - - - -> To use dual T4 GPUs, use `--gpu t4`. To force CPU, use `--cpu` (or omit both flags for default CPU mode). - - +# To use dual T4 GPUs, use --gpu t4. 
To force CPU, use --cpu (or omit both flags for default CPU mode). # Specify voice and language poetry run voicegenhub synthesize "This is a test." --provider qwen --voice Ryan --language en --gpu p100 - - # Chinese with native speaker poetry run voicegenhub synthesize "你好,这是一个测试。" --provider qwen --voice Serena --language zh --gpu p100 - - # Explicit output directory and filename poetry run voicegenhub synthesize "Big model test." \ --provider qwen \ @@ -296,11 +282,8 @@ poetry run voicegenhub synthesize "Big model test." \ --output-filename my_audio.wav \ --gpu p100 - - # Adjust polling timeout (default 60 min) poetry run voicegenhub synthesize "Long text..." --provider qwen --gpu p100 --timeout 90 --poll-interval 30 -| `--gpu-type` | `p100` | Kaggle GPU type: `p100` (default) or `t4` (dual T4, optional) | ``` @@ -313,7 +296,7 @@ poetry run voicegenhub synthesize "Long text..." --provider qwen --gpu p100 --ti | `--voice`, `-v` | `Ryan` | Speaker name: `Ryan`, `Serena`, etc. | | `--language`, `-l` | `en` | Language code: `en`, `zh`, `fr`, etc. | | `--model`, `-m` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | HuggingFace model ID | -| `--output-dir`, `-o` | `YYYYMMDD_HHMMSS` (current datetime) | Local folder for the downloaded audio | +| `--output-dir` | `YYYYMMDD_HHMMSS_` (current datetime) | Local folder for the downloaded audio | | `--output-filename` | `qwen3_tts.wav` | Filename for the generated audio | | `--gpu [p100|t4]` | *(optional)* | Run remotely on Kaggle GPU (specify `p100` or `t4`) | | `--cpu` | *(optional)* | Force CPU mode (default if neither flag is set) | @@ -333,7 +316,7 @@ poetry run voicegenhub synthesize "Long text..." --provider qwen --gpu p100 --ti **Note:** If you do not specify `--gpu` or `--cpu`, VoiceGenHub will run on CPU by default. For Qwen3-TTS and Chatterbox, running on CPU will print a **BIG VISIBLE WARNING** and may be extremely slow or fail. Use `--gpu p100` or `--gpu t4` for remote GPU. Use `--cpu` to force CPU mode explicitly. 
-**The output directory defaults to the current datetime** (e.g. `20260225_153045/qwen3_tts.wav`). +**The output directory defaults to the current datetime plus GPU type** (e.g. `20260225_153045_p100/qwen3_tts.wav`). --- diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index e5368c9..6475ce4 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -396,11 +396,46 @@ def cli(): type=str, help="Qwen 3 TTS: Reference text for voice cloning", ) +@click.option( + "--model", + "-m", + type=str, + default=None, + help="Qwen 3 TTS: HuggingFace model ID (e.g. Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)", +) +@click.option( + "--output-dir", + type=str, + default=None, + help="Kaggle GPU: Local directory for the downloaded audio (default: YYYYMMDD_HHMMSS_)", +) +@click.option( + "--output-filename", + type=str, + default="qwen3_tts.wav", + show_default=True, + help="Kaggle GPU: Filename for the generated audio file", +) +@click.option( + "--timeout", + type=int, + default=60, + show_default=True, + help="Kaggle GPU: Timeout in minutes to wait for the kernel", +) +@click.option( + "--poll-interval", + type=int, + default=60, + show_default=True, + help="Kaggle GPU: Status polling interval in seconds", +) def synthesize( texts, voice, language, output, format, rate, pitch, provider, gpu, cpu, lowpass, normalize, distortion, noise, reverb, pitch_shift, exaggeration, cfg_weight, audio_prompt, turbo, multilingual, - instruct, ref_audio, ref_text + instruct, ref_audio, ref_text, + model, output_dir, output_filename, timeout, poll_interval, ): """Generate speech from text(s). 
Use --gpu [p100|t4] for remote Kaggle GPU acceleration.""" # Redirect to Kaggle pipeline if --gpu is specified @@ -410,34 +445,40 @@ def synthesize( sys.exit(1) from .kaggle.pipeline import KaggleQwenPipeline - pipeline = KaggleQwenPipeline() + pipeline = KaggleQwenPipeline( + model_id=model or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + timeout_minutes=timeout, + poll_interval_seconds=poll_interval, + ) - # Determine output directory and filename using the requested format + # Determine output directory and filename timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") suffix = f"_{gpu}" - output_dir_name = f"{timestamp}{suffix}" - if output: + if output_dir: + # Explicit --output-dir provided + resolved_output_dir = output_dir + resolved_output_filename = output_filename + elif output: output_path = Path(output) - # If user provided a path, we use it as base but still follow the folder naming convention if it's a dir if not output_path.suffix: - output_dir = str(output_path / output_dir_name) - output_filename = "qwen3_tts.wav" + resolved_output_dir = str(output_path / f"{timestamp}{suffix}") + resolved_output_filename = output_filename else: - output_dir = str(output_path.parent / output_dir_name) - output_filename = output_path.name + resolved_output_dir = str(output_path.parent / f"{timestamp}{suffix}") + resolved_output_filename = output_path.name else: - output_dir = output_dir_name - output_filename = "qwen3_tts.wav" + resolved_output_dir = f"{timestamp}{suffix}" + resolved_output_filename = output_filename try: result_path = pipeline.run( text=texts[0], voice=voice or "Ryan", language=language or "en", - output_dir=output_dir, - output_filename=output_filename, - gpu_type=gpu, # Now using the gpu value directly as p100 or t4 + output_dir=resolved_output_dir, + output_filename=resolved_output_filename, + gpu_type=gpu, ) click.echo(f"SUCCESS: Remote audio available at: {result_path.absolute()}") return diff --git a/src/voicegenhub/kaggle/pipeline.py 
b/src/voicegenhub/kaggle/pipeline.py index 98827d2..ae67ef4 100644 --- a/src/voicegenhub/kaggle/pipeline.py +++ b/src/voicegenhub/kaggle/pipeline.py @@ -178,10 +178,18 @@ def pip_install(*packages): def _build_kernel_metadata( username: str, kernel_slug: str, notebook_filename: str, gpu_type: str = "p100" ) -> dict: - """Build Kaggle kernel-metadata.json.""" + """Build Kaggle kernel-metadata.json. + + IMPORTANT: The 'title' must slugify to exactly the same value as the slug + portion of the 'id' field. Kaggle derives the kernel slug from the title + (spaces→hyphens, lowercase) and ignores the 'id' slug portion on creation. + When they differ, every subsequent push hits a 409 Conflict because Kaggle + already owns the title-derived slug. Keep title = "VoiceGenHub Qwen3 TTS" + so it slugifies to "voicegenhub-qwen3-tts", matching _KERNEL_SLUG. + """ return { "id": f"{username}/{kernel_slug}", - "title": f"VoiceGenHub Remote GPU ({gpu_type.upper()})", + "title": "VoiceGenHub Qwen3 TTS", "code_file": notebook_filename, "language": "python", "kernel_type": "notebook", @@ -272,13 +280,16 @@ def __init__( dtype: str = "float16", kernel_slug: str = _KERNEL_SLUG, settings_path: Optional[Path] = None, + timeout_minutes: Optional[int] = None, + poll_interval_seconds: Optional[int] = None, ): self.model_id = model_id self.dtype = dtype self.kernel_slug = kernel_slug self._settings = _load_settings() if settings_path is None else json.loads(Path(settings_path).read_text()) - self._timeout_minutes = self._settings.get("deployment_timeout_minutes", 60) - self._poll_interval = self._settings.get("polling_interval_seconds", 60) + # CLI-provided values take precedence over settings file + self._timeout_minutes = timeout_minutes if timeout_minutes is not None else self._settings.get("deployment_timeout_minutes", 60) + self._poll_interval = poll_interval_seconds if poll_interval_seconds is not None else self._settings.get("polling_interval_seconds", 60) # 
------------------------------------------------------------------ # Public API From 62995788f4145aa0be225f3bfe442158edf8dfe3 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 16:30:17 +0100 Subject: [PATCH 10/13] added autoflake to precommit hook --- .pre-commit-config.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6d08642..04e64a7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,15 @@ repos: - id: check-merge-conflict - id: debug-statements + - repo: https://github.com/PyCQA/autoflake + rev: v2.3.1 + hooks: + - id: autoflake + args: + - --in-place + - --remove-unused-variables + - --remove-all-unused-imports + - repo: https://github.com/pycqa/flake8 rev: 7.0.0 hooks: From 556ad48d6cab307b934266c4de193657915de8b5 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 16:33:06 +0100 Subject: [PATCH 11/13] multi-sentence generation in one run + printing available voices --- 20260225/voicegenhub-qwen3-tts-gpu.log | 91 ---- README.md | 596 +++++-------------------- src/voicegenhub/cli.py | 52 ++- src/voicegenhub/kaggle/pipeline.py | 202 ++++++--- src/voicegenhub/providers/qwen.py | 106 +++-- 5 files changed, 337 insertions(+), 710 deletions(-) delete mode 100644 20260225/voicegenhub-qwen3-tts-gpu.log diff --git a/20260225/voicegenhub-qwen3-tts-gpu.log b/20260225/voicegenhub-qwen3-tts-gpu.log deleted file mode 100644 index 3f1dc52..0000000 --- a/20260225/voicegenhub-qwen3-tts-gpu.log +++ /dev/null @@ -1,91 +0,0 @@ -[{"stream_name":"stderr","time":6.465816424,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"} -,{"stream_name":"stderr","time":6.46588892,"data":"0.00s - make the debugger miss breakpoints. 
Please pass -Xfrozen_modules=off\n"} -,{"stream_name":"stderr","time":6.465894936,"data":"0.00s - to python to disable frozen modules.\n"} -,{"stream_name":"stderr","time":6.465899538,"data":"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"} -,{"stream_name":"stderr","time":7.061415414,"data":"0.00s - Debugger warning: It seems that frozen modules are being used, which may\n"} -,{"stream_name":"stderr","time":7.061447949,"data":"0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n"} -,{"stream_name":"stderr","time":7.06145249,"data":"0.00s - to python to disable frozen modules.\n"} -,{"stream_name":"stderr","time":7.061455112,"data":"0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n"} -,{"stream_name":"stdout","time":13.773725016,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.4/61.4 kB 5.3 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":13.927271089,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.0/44.0 kB 3.5 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":13.976515574,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.4/61.4 kB 5.3 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":13.97657512,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.0/44.0 kB 3.5 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":14.365197171,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.9/63.9 kB 4.8 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":14.567451397,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.9/63.9 kB 4.8 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.304112232,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.5/113.5 kB 9.6 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.418073368,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.0/12.0 MB 103.9 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.506303775,"data":" 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.5/113.5 kB 9.6 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.506331495,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.0/12.0 MB 103.9 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.574433696,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 86.6 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.596169453,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 566.4/566.4 kB 39.5 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.776377537,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 86.6 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":16.77642332,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 566.4/566.4 kB 39.5 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":33.162250396,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 31.7 MB/s eta 0:00:00\n"} -,{"stream_name":"stdout","time":33.364164004,"data":" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 31.7 MB/s eta 0:00:00\n"} -,{"stream_name":"stderr","time":79.42841178,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} -,{"stream_name":"stderr","time":79.428609104,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} -,{"stream_name":"stderr","time":79.428630478,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} -,{"stream_name":"stderr","time":79.428635058,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} -,{"stream_name":"stderr","time":79.428638808,"data":"W0000 00:00:1772045650.692242 23 
computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.428642335,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.428645726,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.428733057,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633527785,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} -,{"stream_name":"stderr","time":79.633570118,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} -,{"stream_name":"stderr","time":79.633578121,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} -,{"stream_name":"stderr","time":79.63358432,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} -,{"stream_name":"stderr","time":79.633589577,"data":"W0000 00:00:1772045650.692242 23 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.63359445,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633599539,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633616186,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633619156,"data":"\n"} -,{"stream_name":"stderr","time":79.633621351,"data":"2026-02-25 18:54:09.680556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n"} -,{"stream_name":"stderr","time":79.63362412,"data":"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n"} -,{"stream_name":"stderr","time":79.633626418,"data":"E0000 00:00:1772045650.000905 23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n"} -,{"stream_name":"stderr","time":79.633628938,"data":"E0000 00:00:1772045650.052818 23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n"} -,{"stream_name":"stderr","time":79.633632435,"data":"W0000 00:00:1772045650.692242 23 computation_placer.cc:177] computation placer already registered. 
Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633634944,"data":"W0000 00:00:1772045650.692291 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633637372,"data":"W0000 00:00:1772045650.692294 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":79.633639898,"data":"W0000 00:00:1772045650.692297 23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n"} -,{"stream_name":"stderr","time":98.077741199,"data":"/bin/sh: 1: sox: not found\n"} -,{"stream_name":"stderr","time":98.084522944,"data":"/bin/sh: 1: sox: not found\n"} -,{"stream_name":"stderr","time":98.084561306,"data":"WARNING:sox:SoX could not be found!\n"} -,{"stream_name":"stderr","time":98.084569906,"data":"\n"} -,{"stream_name":"stderr","time":98.084573978,"data":" If you do not have SoX, proceed here:\n"} -,{"stream_name":"stderr","time":98.084577812,"data":" - - - http://sox.sourceforge.net/ - - -\n"} -,{"stream_name":"stderr","time":98.084581684,"data":"\n"} -,{"stream_name":"stderr","time":98.084585536,"data":" If you do (or think that you should) have SoX, double-check your\n"} -,{"stream_name":"stderr","time":98.084589885,"data":" path variables.\n"} -,{"stream_name":"stderr","time":98.084593742,"data":" \n"} -,{"stream_name":"stderr","time":98.084597023,"data":"\n"} -,{"stream_name":"stderr","time":98.084600338,"data":"/bin/sh: 1: sox: not found\n"} -,{"stream_name":"stderr","time":98.084603859,"data":"WARNING:sox:SoX could not be found!\n"} -,{"stream_name":"stderr","time":98.084607247,"data":"\n"} -,{"stream_name":"stderr","time":98.084610416,"data":" If you do not have SoX, proceed here:\n"} 
-,{"stream_name":"stderr","time":98.084613818,"data":" - - - http://sox.sourceforge.net/ - - -\n"} -,{"stream_name":"stderr","time":98.084618784,"data":"\n"} -,{"stream_name":"stderr","time":98.084622464,"data":" If you do (or think that you should) have SoX, double-check your\n"} -,{"stream_name":"stderr","time":98.084634034,"data":" path variables.\n"} -,{"stream_name":"stderr","time":98.084636545,"data":" \n"} -,{"stream_name":"stdout","time":99.940193512,"data":"CUDA available: True\n"} -,{"stream_name":"stdout","time":99.94023427,"data":"GPU: Tesla P100-PCIE-16GB\n"} -,{"stream_name":"stdout","time":99.940239859,"data":"Loading model: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice\n"} -,{"stream_name":"stdout","time":134.503678322,"data":"Generating speech...\n"} -,{"stream_name":"stderr","time":134.59886615,"data":"Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.\n"} -,{"stream_name":"stderr","time":134.598897308,"data":"\n"} -,{"stream_name":"stderr","time":134.598903838,"data":"Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.\n"} -,{"stream_name":"stdout","time":160.741624344,"data":"Audio saved to /kaggle/working/qwen3_tts.wav\n"} -,{"stream_name":"stdout","time":160.741672884,"data":"Sample rate: 24000 Hz, Duration: 10.00s\n"} -,{"stream_name":"stderr","time":165.862483141,"data":"/usr/local/lib/python3.12/dist-packages/mistune.py:435: SyntaxWarning: invalid escape sequence '\\|'\n"} -,{"stream_name":"stderr","time":165.86251786,"data":" cells[i][c] = re.sub('\\\\\\\\\\|', '|', cell)\n"} -,{"stream_name":"stderr","time":166.010702784,"data":"/usr/local/lib/python3.12/dist-packages/nbconvert/filters/filter_links.py:36: SyntaxWarning: invalid escape sequence '\\_'\n"} -,{"stream_name":"stderr","time":166.010732274,"data":" text = re.sub(r'_', '\\_', text) # Escape underscores in display text\n"} -,{"stream_name":"stderr","time":166.570315686,"data":"/usr/local/lib/python3.12/dist-packages/traitlets/traitlets.py:2915: 
FutureWarning: --Exporter.preprocessors=[\"remove_papermill_header.RemovePapermillHeader\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... multiple times to add items to a list.\n"} -,{"stream_name":"stderr","time":166.570350872,"data":" warn(\n"} -,{"stream_name":"stderr","time":166.596721549,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to notebook\n"} -,{"stream_name":"stderr","time":166.933120221,"data":"[NbConvertApp] Writing 147162 bytes to __notebook__.ipynb\n"} -,{"stream_name":"stderr","time":169.293420137,"data":"/usr/local/lib/python3.12/dist-packages/traitlets/traitlets.py:2915: FutureWarning: --Exporter.preprocessors=[\"nbconvert.preprocessors.ExtractOutputPreprocessor\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... multiple times to add items to a list.\n"} -,{"stream_name":"stderr","time":169.293456667,"data":" warn(\n"} -,{"stream_name":"stderr","time":169.314149193,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to html\n"} -,{"stream_name":"stderr","time":170.145421049,"data":"[NbConvertApp] Writing 396049 bytes to __results__.html\n"} -] diff --git a/README.md b/README.md index 4dbb8b0..a8c8ff7 100644 --- a/README.md +++ b/README.md @@ -1,562 +1,176 @@ -[![Unit tests](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml) +[![Unit tests](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml) [![Daily regression test](https://github.com/leweex95/voicegenhub/actions/workflows/daily-regression-test.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/daily-regression-test.yml) 
[![codecov](https://codecov.io/gh/leweex95/voicegenhub/branch/master/graph/badge.svg)](https://codecov.io/gh/leweex95/voicegenhub) # VoiceGenHub -Simple, user-friendly Text-to-Speech (TTS) library with CLI and Python API. Supports multiple free and commercial TTS providers. +Simple CLI-first Text-to-Speech library supporting multiple free and commercial providers — including free Kaggle GPU inference for state-of-the-art models. -### Optional Dependencies - -- **Microsoft Edge TTS** (free, cloud-based) -- **Kokoro TTS** (Apache 2.0 licensed, self-hosted lightweight TTS) -- **Bark TTS** (MIT licensed, self-hosted high-naturalness TTS with prosody control) -- **Chatterbox TTS** (MIT licensed, multilingual with emotion control) - Works on CPU or GPU -- **Qwen 3 TTS** (Apache 2.0 licensed, multilingual with voice design and cloning) - State-of-the-art quality -- **ElevenLabs TTS** (commercial, high-quality voices) - -### Voice Cloning Support +--- -For voice cloning features with Chatterbox TTS: +## Install ```bash -pip install voicegenhub[voice-cloning] -# or -poetry install -E voice-cloning +poetry add voicegenhub ``` -**Voice cloning requirements:** -- FFmpeg (manual installation required) -- PyTorch (standard version) +--- -**On Windows:** Download the "full-shared" FFmpeg build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and add the `bin` directory to your system PATH. +## Providers -**Note:** VoiceGenHub includes a compatibility layer to ensure stable execution on CPU-only systems and prevents common import-time crashes related to experimental dependencies like TorchCodec. Standard TTS and voice cloning mechanisms will automatically fall back to supported audio loaders if needed. 
+| Provider | License | Local / Cloud | Notes | +|---|---|---|---| +| **Edge TTS** | Free (Microsoft) | Cloud | Fastest, zero setup | +| **Kokoro** | Apache 2.0 | Local | Lightweight, high quality | +| **Bark** | MIT | Local | Prosody markers, 100+ voices | +| **Chatterbox** | MIT | Local | Emotion control, voice cloning | +| **Qwen 3 TTS** | Apache 2.0 | Local / Kaggle GPU | State-of-the-art multilingual | +| **ElevenLabs** | Paid API | Cloud | Commercial-grade voices | -## Usage +--- -### Chatterbox TTS +## Synthesize -```bash -poetry run voicegenhub synthesize "Hello, world!" --provider chatterbox --voice chatterbox-default --output hello.wav -``` +### Edge TTS (fastest, no setup) -**Chatterbox features:** -- **Model selection via voice**: Choose between standard, turbo, or multilingual models using the `--voice` flag -- Emotion/intensity control with `exaggeration` parameter (0.0-1.0) -- Zero-shot voice cloning from audio samples -- MIT License - fully commercial compatible -- State-of-the-art quality (competitive with ElevenLabs) -- Built-in Perth watermarking for responsible AI - -**Chatterbox voices:** -- `chatterbox-default`: Standard English model with emotion control -- `chatterbox-turbo`: Turbo English model (faster generation, English only) -- `chatterbox-`: Multilingual model for specific languages (e.g., `chatterbox-es` for Spanish) - -**Chatterbox parameters:** -- `--exaggeration`: Emotion intensity (0.0-1.0, default 0.5). Higher values = more dramatic/emotional. -- `--cfg-weight`: Classifier-free guidance weight (0.0-1.0, default 0.5). Controls the influence of the text prompt. -- `--audio-prompt`: Path to reference audio for voice cloning (optional). -- `temperature`, `max_new_tokens`, `repetition_penalty`, `min_p`, `top_p`: Advanced generation parameters (available in Python API). - -**Multilingual Support:** -Chatterbox supports 23 languages. 
Use the appropriate voice for the target language: ```bash -poetry run voicegenhub synthesize "Hola, esto es una prueba de voz en español." --provider chatterbox --voice chatterbox-es --output spanish.wav +poetry run voicegenhub synthesize "Hello, world!" --provider edge --voice en-US-AriaNeural --output hello.mp3 ``` -**Chatterbox supported languages:** ar, da, de, el, en, es, fi, fr, he, hi, it, ja, ko, ms, nl, no, pl, pt, ru, sv, sw, tr, zh - -**Chatterbox Installation Requirements:** -- **TorchCodec** (optional): Required for voice cloning features. Install with `pip install torchcodec` or `poetry install -E voice-cloning`. -- **FFmpeg**: Required when TorchCodec is installed for voice cloning. On Windows, install the "full-shared" build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and ensure FFmpeg's `bin` directory is in your system PATH. -- **PyTorch Compatibility**: TorchCodec 0.9.1 requires PyTorch ≤ 2.4.x. If you have a newer PyTorch version, voice cloning will be automatically disabled with a fallback to standard TTS. -- Without TorchCodec/FFmpeg, basic TTS will work but voice cloning (`--audio-prompt`) will gracefully fall back to standard TTS without cloning. - -### Qwen 3 TTS +### Kokoro ```bash -poetry run voicegenhub synthesize "Hello, world!" --provider qwen --voice Ryan --output hello.wav +poetry run voicegenhub synthesize "Hello, world!" 
--provider kokoro --voice kokoro-af_alloy --output hello.wav ``` -**Qwen 3 TTS features:** -- **Three generation modes**: CustomVoice (predefined speakers), VoiceDesign (natural language voice description), VoiceClone (reference audio-based) -- **10 languages**: Chinese, English, French, German, Italian, Japanese, Korean, Portuguese, Russian, Spanish -- **Native speakers**: Automatic selection of native speakers per language for natural, accent-free speech -- **Voice control via natural language**: Use `instruct` parameter to control emotion, tone, speaking rate, and style -- **Ultra-low latency**: Streaming generation with <100ms first-token latency -- **Apache 2.0 License**: Fully commercial compatible -- **State-of-the-art quality**: Competitive with ElevenLabs, developed by Alibaba's Qwen team - -#### Mode 1: CustomVoice (Predefined Speakers) - -Use predefined premium speakers with optional emotion/style control: +### Bark ```bash -# Basic usage with auto-selected native speaker -poetry run voicegenhub synthesize "Hello, this is a test." --provider qwen --language en --output output.wav - -# Explicit speaker selection -poetry run voicegenhub synthesize "Hello, this is a test." --provider qwen --language en --voice Ryan --output output.wav - -# With emotion instruction -poetry run voicegenhub synthesize "I'm so excited about this news!" 
--provider qwen --language en --voice Ryan --instruct "Speak with excitement and joy" --output happy.wav -``` - -**Available speakers and their native languages:** - -| Speaker | Description | Native Language | Best For | -|---------|-------------|----------------|----------| -| **Ryan** | Dynamic male voice with strong rhythmic drive | English | English content, presentations | -| **Aiden** | Sunny American male voice with clear midrange | English | English content, narration | -| **Vivian** | Bright, slightly edgy young female voice | Chinese | Mandarin content, audiobooks | -| **Serena** | Warm, gentle young female voice | Chinese | Mandarin content, customer service | -| **Uncle_Fu** | Seasoned male voice with low, mellow timbre | Chinese | Mandarin narration, mature content | -| **Dylan** | Youthful Beijing male voice, natural timbre | Chinese (Beijing) | Beijing dialect content | -| **Eric** | Lively Chengdu male voice, slightly husky | Chinese (Sichuan) | Sichuan dialect content | -| **Ono_Anna** | Playful Japanese female, light and nimble | Japanese | Japanese content, anime | -| **Sohee** | Warm Korean female with rich emotion | Korean | Korean content, storytelling | - -**Auto-speaker selection:** If no speaker is specified, Qwen 3 TTS automatically selects a native speaker based on the target language (e.g., Ryan for English, Serena for Chinese). 
- -**Emotion and style control:** Use the `--instruct` parameter with natural language to control voice characteristics: -- `"Speak with excitement and joy"` -- `"Very angry tone"` -- `"Whisper gently"` -- `"Speak slowly and calmly"` -- `"Energetic and enthusiastic"` - -#### Mode 2: VoiceDesign (Natural Language Voice Description) - -Design custom voices using natural language instructions (requires `Qwen3-TTS-VoiceDesign` model): - -```python -from voicegenhub.providers.factory import provider_factory -from voicegenhub.providers.base import TTSRequest - -config = { - "model_name_or_path": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", - "generation_mode": "voice_design", -} - -await provider_factory.discover_provider("qwen") -provider = await provider_factory.create_provider("qwen", config=config) - -request = TTSRequest( - text="Welcome to our demonstration.", - language="en", - voice_id="default", - extra_params={ - "instruct": "Male, 30 years old, confident and professional tone, deep voice with clear articulation" - } -) -response = await provider.synthesize(request) -``` - -**VoiceDesign instruction examples:** -- `"Female, 25 years old, cheerful and energetic, slightly high-pitched with playful intonation"` -- `"Male, 17 years old, gaining confidence, deeper breath support, vowels tighten when nervous"` -- `"Elderly male, 70 years old, wise and gentle, slightly raspy with warm timbre"` - -#### Mode 3: VoiceClone (Reference Audio-Based) - -Clone voices from 3-second audio samples (requires `Qwen3-TTS-Base` model): - -```python -from voicegenhub.providers.factory import provider_factory -from voicegenhub.providers.base import TTSRequest - -config = { - "model_name_or_path": "Qwen/Qwen3-TTS-12Hz-1.7B-Base", - "generation_mode": "voice_clone", -} - -await provider_factory.discover_provider("qwen") -provider = await provider_factory.create_provider("qwen", config=config) - -request = TTSRequest( - text="This is synthesized using the cloned voice.", - language="en", - 
voice_id="default", - extra_params={ - "ref_audio": "path/to/reference.wav", # Can be local path, URL, or numpy array - "ref_text": "Transcript of the reference audio", # Required for best quality - "x_vector_only_mode": False # Set True to skip ref_text (lower quality) - } -) -response = await provider.synthesize(request) +poetry run voicegenhub synthesize "Hello, world!" --provider bark --voice bark-en_speaker_0 --output hello.wav ``` -**Voice cloning tips:** -- Use clear, noise-free reference audio (3-10 seconds) -- Provide accurate transcript (`ref_text`) for best cloning quality -- Supports multilingual cloning (clone any language, synthesize in any language) -- Combine with VoiceDesign to create reusable custom voices - -#### Word Emphasis and Pause Control +Bark supports prosody markers: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]`. -**Note:** Qwen 3 TTS does not support explicit word-level emphasis markup (like SSML tags) or pause control. Instead, the model intelligently interprets text and applies natural prosody based on: +### Chatterbox (emotion control + voice cloning) -1. **Context understanding**: The model reads the entire sentence and applies appropriate emphasis to important words automatically -2. **Natural language instructions**: Use the `instruct` parameter to guide overall tone and pacing: - - `"Speak slowly with emphasis on key words"` - - `"Pause dramatically between sentences"` - - `"Fast-paced and energetic delivery"` -3. **Punctuation**: The model respects punctuation for natural pauses (commas, periods, ellipses, em-dashes) - -**Example:** ```bash -# The model will naturally emphasize "incredible results" due to context -poetry run voicegenhub synthesize "We achieved incredible results!" 
--provider qwen --voice Ryan --instruct "Speak with excitement and emphasis" --output emphasized.wav -``` - -#### Model Selection - -Qwen 3 TTS offers multiple models optimized for different use cases: +# Basic +poetry run voicegenhub synthesize "Hello, world!" --provider chatterbox --voice chatterbox-default --output hello.wav -| Model | Size | Best For | Streaming | GPU Recommended |Supports | -|-------|------|----------|-----------|-----------------|---------| -| `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | 600M | Default, fast generation, predefined speakers | ✅ | Optional | CustomVoice | -| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | 1.7B | Higher quality, predefined speakers | ✅ | Yes | CustomVoice | -| `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | 1.7B | Custom voice design via natural language | ✅ | Yes | VoiceDesign | -| `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | 1.7B | Voice cloning from audio samples | ✅ | Yes | VoiceClone | -| `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | 600M | Voice cloning, faster generation | ✅ | Optional | VoiceClone | +# With emotion intensity +poetry run voicegenhub synthesize "This is incredible!" --provider chatterbox --voice chatterbox-default --exaggeration 0.8 --output excited.wav -**Installation:** -```bash -pip install voicegenhub[qwen] -# or -poetry install --with qwen +# Voice cloning from a reference file +poetry run voicegenhub synthesize "Hello, cloned voice." 
--provider chatterbox --voice chatterbox-default --audio-prompt reference.wav --output cloned.wav ``` -**Qwen 3 TTS parameters (Python API):** -- `model_name_or_path`: Model to use (see table above) -- `device`: "cuda", "cpu", or "auto" (default: auto) -- `dtype`: "float32", "float16", "bfloat16" (default: bfloat16) -- `attn_implementation`: "eager", "sdpa", "flash_attention_2" (default: eager) -- `generation_mode`: "custom_voice", "voice_design", "voice_clone" -- `speaker`: Speaker name for CustomVoice mode -- `instruct`: Emotion/style instruction (for CustomVoice) or voice description (for VoiceDesign) -- `temperature`, `top_p`, `top_k`, `repetition_penalty`, `max_new_tokens`: Advanced sampling parameters - -### Qwen3-TTS on Kaggle P100 GPU - -Run the full Qwen3-TTS pipeline on a **free Kaggle P100 GPU**. VoiceGenHub automatically pushes a notebook to Kaggle, runs it with GPU acceleration, polls for completion, and downloads the audio to a local timestamped folder — no Kaggle web UI interaction required. - -#### Prerequisites - -1. **Install the Kaggle CLI:** - ```bash - pip install kaggle - ``` - -2. **Set up Kaggle API credentials** (`~/.kaggle/kaggle.json`): - - Go to https://www.kaggle.com/settings → API → Create New Token - - Save the downloaded `kaggle.json` to `~/.kaggle/kaggle.json` - - On Windows: `%USERPROFILE%\.kaggle\kaggle.json` - -3. **Enable internet on Kaggle notebooks** (required for `pip install`): - - Kaggle by default allows internet access from notebooks (no action needed). - -#### Usage - +### ElevenLabs ```bash -# Basic usage — outputs to a timestamped folder (YYYYMMDD_HHMMSS_p100) -poetry run voicegenhub synthesize "Hello from the Kaggle GPU!" --provider qwen --gpu p100 - -# To use dual T4 GPUs, use --gpu t4. To force CPU, use --cpu (or omit both flags for default CPU mode). - -# Specify voice and language -poetry run voicegenhub synthesize "This is a test." 
--provider qwen --voice Ryan --language en --gpu p100 - -# Chinese with native speaker -poetry run voicegenhub synthesize "你好,这是一个测试。" --provider qwen --voice Serena --language zh --gpu p100 - -# Explicit output directory and filename -poetry run voicegenhub synthesize "Big model test." \ - --provider qwen \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --output-dir 20260225_153045 \ - --output-filename my_audio.wav \ - --gpu p100 - -# Adjust polling timeout (default 60 min) -poetry run voicegenhub synthesize "Long text..." --provider qwen --gpu p100 --timeout 90 --poll-interval 30 -``` - - -#### All `synthesize` flags for Kaggle GPU - -| Flag | Default | Description | -|------|---------|-------------| -| `TEXT` | *(required)* | Text to synthesize | -| `--provider` | *(required)* | TTS provider: `qwen`, `chatterbox`, etc. | -| `--voice`, `-v` | `Ryan` | Speaker name: `Ryan`, `Serena`, etc. | -| `--language`, `-l` | `en` | Language code: `en`, `zh`, `fr`, etc. | -| `--model`, `-m` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | HuggingFace model ID | -| `--output-dir` | `YYYYMMDD_HHMMSS_` (current datetime) | Local folder for the downloaded audio | -| `--output-filename` | `qwen3_tts.wav` | Filename for the generated audio | -| `--gpu [p100|t4]` | *(optional)* | Run remotely on Kaggle GPU (specify `p100` or `t4`) | -| `--cpu` | *(optional)* | Force CPU mode (default if neither flag is set) | -| `--timeout` | `60` | Timeout in minutes to wait for the kernel | -| `--poll-interval` | `60` | Status polling interval in seconds | - -#### How it works - -1. **Build** — VoiceGenHub generates a Jupyter notebook with your text, voice, and model parameters. -2. **Push** — The notebook is pushed to Kaggle with `enable_gpu: true` (P100). -3. **Run** — Kaggle executes the notebook: installs `qwen-tts`, loads the model on the GPU, generates audio. -4. **Poll** — VoiceGenHub polls `kaggle kernels status` every 60 seconds until completion. -5. 
**Download** — The `.wav` file is fetched with `kaggle kernels output` and placed in your local output directory. - - - - -**Note:** If you do not specify `--gpu` or `--cpu`, VoiceGenHub will run on CPU by default. For Qwen3-TTS and Chatterbox, running on CPU will print a **BIG VISIBLE WARNING** and may be extremely slow or fail. Use `--gpu p100` or `--gpu t4` for remote GPU. Use `--cpu` to force CPU mode explicitly. - -**The output directory defaults to the current datetime plus GPU type** (e.g. `20260225_153045_p100/qwen3_tts.wav`). - ---- - -## ⚠️ IMPORTANT: GPU Requirement for Qwen3/Chatterbox - -**Qwen3-TTS and Chatterbox require a GPU for practical generation speed.** - -- If you run these providers **without** `--gpu` (or on a CPU-only machine), you will see a **BIG WARNING** and generation will be extremely slow or may fail. -- Always use `--gpu` for Qwen3 and Chatterbox unless you are on a local machine with a powerful GPU. - -**Example warning:** - -``` -WARNING: Qwen3-TTS and Chatterbox require a GPU for fast generation. Use --gpu (and optionally --gpu-type) to run on Kaggle or your local GPU. CPU-only runs are not recommended and may fail. +poetry run voicegenhub synthesize "Hello, world!" --provider elevenlabs --voice elevenlabs-EXAVITQu4vr4xnSDxMaL --output hello.mp3 ``` -#### Available Qwen3-TTS Models on Kaggle GPU - -| Model | Size | Speed | Best For | -|-------|------|-------|----------| -| `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | 600M | Fast | Quick iterations | -| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | 1.7B | Normal | **Best quality** ✅ | - -> **Tip:** The 1.7B model is recommended for production quality. A P100 has 16 GB VRAM — more than enough for the 1.7B model at float16. +Store your API key in `config/elevenlabs-api-key.json` as `{"ELEVENLABS_API_KEY": "..."}`. --- -### Bark +## Qwen 3 TTS ```bash -poetry run voicegenhub synthesize "Hello, world!" 
--provider bark --voice bark-en_speaker_0 --output hello.wav -``` +poetry run voicegenhub synthesize "Hello from the GPU!" --provider qwen --voice Ryan --language en --gpu p100 -**Bark features:** -- Highest naturalness among open-source TTS -- Prosody markers for emotional expression: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]` -- 100+ speaker presets -- Sound effects generation +# Batch multiple sentences in one GPU job, saved as audio_001.wav … audio_003.wav + manifest.json +poetry run voicegenhub synthesize \ + "The quick brown fox jumps over the lazy dog." \ + "Technology is changing the world at an unprecedented pace." \ + "The sunset painted the sky in shades of orange and pink." \ + --provider qwen --voice Ryan --language en --gpu p100 -**Bark supported voices:** Use preset names like `bark-en_speaker_0`, `bark-en_speaker_1`, etc. +# Chinese with native speaker +poetry run voicegenhub synthesize "你好,这是一个测试。" --provider qwen --voice Serena --language zh --gpu p100 -### Edge TTS +# Use T4 GPUs instead of P100 +poetry run voicegenhub synthesize "Hello!" --provider qwen --gpu t4 -```bash -poetry run voicegenhub synthesize "Hello, world!" --provider edge --voice en-US-AriaNeural --output hello.mp3 +# Custom model, output directory, polling options +poetry run voicegenhub synthesize "Hello!" \ + --provider qwen \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --output-dir my_output \ + --gpu p100 \ + --timeout 90 \ + --poll-interval 30 ``` -**Edge TTS supported voices:** Check the list of supported voices [here](https://speech.microsoft.com/portal/voicegallery). +Batch output lands in a timestamped folder (e.g. `20260227_123130_p100/`) with: +- `audio_001.wav`, `audio_002.wav`, … (one per input sentence) +- `manifest.json` — maps each filename to its source text and duration -### Kokoro TTS - -```bash -poetry run voicegenhub synthesize "Hello, world!"
--provider kokoro --voice kokoro-af_alloy --output hello.wav -``` +--- -**Kokoro supported voices:** Check the list of supported voices [here](https://github.com/nazdridoy/kokoro-tts?tab=readme-ov-file#supported-voices). +## Batch Processing (local providers) -### ElevenLabs +Pass multiple texts to any provider — processed concurrently with shared model instances: ```bash -poetry run voicegenhub synthesize "Hello, world!" --provider elevenlabs --voice elevenlabs-EXAVITQu4vr4xnSDxMaL --output hello.mp3 +poetry run voicegenhub synthesize "First." "Second." "Third." --provider edge --output batch_output ``` -Set your API key in `config/elevenlabs-api-key.json` (the key should be stored as the value for `"ELEVENLABS_API_KEY"` in the JSON file). - -**ElevenLabs supported voices:** Check the list of supported voices [here](https://elevenlabs.io/docs/voices). +--- -## Print all available voices per provider +## Voices ```bash -poetry run voicegenhub voices --language en --provider chatterbox -poetry run voicegenhub voices --language en --provider bark poetry run voicegenhub voices --language en --provider edge poetry run voicegenhub voices --language en --provider kokoro +poetry run voicegenhub voices --language en --provider bark +poetry run voicegenhub voices --language en --provider chatterbox poetry run voicegenhub voices --language en --provider elevenlabs +poetry run voicegenhub voices --provider qwen ``` -## Batch Processing with Concurrency Control - -Process multiple texts concurrently with automatic provider-specific resource management: - -```bash -# Process multiple texts (auto-numbered output files) -poetry run voicegenhub synthesize "First text" "Second text" "Third text" --provider edge --output batch_output - -# Control concurrency (auto-configured per provider if not specified) -poetry run voicegenhub synthesize "Text 1" "Text 2" --provider bark --max-concurrent 2 --output output -``` - -**Provider Concurrency Limits (automatic):** -- **Fast providers** 
(Edge, Kokoro, ElevenLabs): Use all CPU cores -- **Heavy providers** (Bark: 2 concurrent, Chatterbox: 1 concurrent) - -**Benefits:** -- Model instances are shared across concurrent jobs (no reloading) -- Automatic resource management prevents system overload -- Progress tracking for each job -- Failed jobs don't stop the batch - -## Voice Cloning with Kokoro and Chatterbox - -VoiceGenHub supports zero-shot voice cloning by combining Kokoro's lightweight voices with Chatterbox's advanced cloning capabilities. This allows you to create custom voices that sound like Kokoro but with Chatterbox's superior quality and emotion control. - -### Step-by-Step Guide - -1. **Generate a Kokoro voice sample** (modify as desired or keep undistorted): - ```bash - # Undistorted voice - poetry run voicegenhub synthesize "Sample text for cloning." --provider kokoro --voice kokoro-am_michael --output reference.wav --format wav - - # Or with effects (e.g., horror/distortion) - poetry run voicegenhub synthesize "Sample text for cloning." --provider kokoro --voice kokoro-am_adam --output reference.wav --format wav --pitch-shift -2 --distortion 0.02 --lowpass 2000 --normalize - ``` - -2. **Clone the voice with Chatterbox**: - ```bash - poetry run voicegenhub synthesize "Your longer text here." --provider chatterbox --voice chatterbox-default --output cloned_voice.wav --audio-prompt reference.wav - ``` - -3. **Optional: Adjust emotion and style**: - ```bash - poetry run voicegenhub synthesize "Your text." 
--provider chatterbox --voice chatterbox-default --output cloned_voice.wav --audio-prompt reference.wav --exaggeration 0.8 --cfg-weight 0.7 - ``` - -**Tips:** -- Use short, clear reference audio (5-10 seconds) for best cloning results -- Combine multiple Kokoro samples with FFmpeg for richer voice profiles -- Experiment with Kokoro effects to create unique voice characteristics before cloning -- Chatterbox supports multilingual cloning from any language reference audio +### Qwen speakers -## Concurrency and Memory Management +| Speaker | Gender | Native language | Notes | +|---|---|---|---| +| `Ryan` | Male | English | Dynamic, rhythmic — works for all languages | +| `Aiden` | Male | English | Sunny American voice | +| `Vivian` | Female | Chinese | Bright, slightly edgy | +| `Serena` | Female | Chinese | Warm, gentle | +| `Uncle_Fu` | Male | Chinese | Low, mellow timbre | +| `Dylan` | Male | Chinese (Beijing) | Natural, youthful | +| `Eric` | Male | Chinese (Sichuan) | Slightly husky | +| `Ono_Anna` | Female | Japanese | Playful, nimble | +| `Sohee` | Female | Korean | Warm, emotional | -**Async Concurrency (Recommended):** -- Use the `synthesize` command with multiple texts for safe concurrent processing within a single process -- Models are loaded once and shared across concurrent jobs -- Prevents out-of-memory (OOM) errors from duplicate model loading -- Automatic provider-specific limits ensure stability - -**Multiprocessing Risks:** -- Running multiple CLI processes simultaneously (e.g., via scripts or parallel jobs) loads separate model instances -- Heavy models like Chatterbox (3.7GB) and Bark (4GB) can cause OOM when duplicated across processes -- **Recommendation:** Use async batch processing instead of multiprocessing for heavy providers -- For light providers (Edge, Kokoro), multiprocessing is safer due to minimal memory footprint - -## Performance Comparison: All TTS Providers - -Here's how all providers compare in terms of speed and quality: - -| 
Provider | Quality (MOS) | Startup Time | Sequential (per req) | Async (3x parallel) | Model Size | Commercial Licensed | -|----------|---------------|--------------|---------------------|-------------------|------------|----------------| -| **Edge TTS** | 3.8/5 | 4.9s | 3.2s | 2.5s | 0MB (cloud) | ✅ Free | -| **Kokoro** | 3.5/5 | 94s | 14.2s | 2.5s | 625MB | ✅ Apache 2.0 | -| **Bark** | 4.2/5 | 180s | 25-40s | 8-12s | 4GB | ✅ MIT | -| **Chatterbox** | 4.3/5 | 120s | 15-30s | 5-15s | 3.7GB | ✅ MIT | -| **ElevenLabs** | 4.5/5* | 2s | 3-5s | 2-3s | 0MB (cloud) | ⚠️ Paid API | - -*ElevenLabs quality estimate based on provider reputation; not yet tested with API key. - -**Key Findings:** -- **Chatterbox**: Excellent quality with emotion control and multilingual support; MIT licensed, works on CPU -- **Bark**: Highest naturalness for premium narration; MIT licensed (full commercial freedom) -- **Kokoro**: Best balance of quality vs speed for offline use; Apache 2.0 licensed -- **Edge TTS**: Best for real-time, low-latency applications; cloud-based (Microsoft) -- **ElevenLabs**: Highest quality but requires paid API and credit card -- **For commercial purposes:** Use Bark (MIT), Chatterbox (MIT), or Kokoro (Apache 2.0) - -## Chatterbox Concurrency Analysis - -**Memory Safety**: Chatterbox uses a **shared model instance** (3.6GB) across all threads - **no duplication**. Safe to use 2-8 concurrent threads without OOM risk. - -**Performance**: ~2.8x speedup at 4 threads on CPU. Optimal thread count: **2-4 threads**. - -**[View Interactive Performance Analysis](assets/concurrency_plot.html)** - Shows speedup curves, memory usage, and timing breakdowns. 
- -## Commercial Licensing - -### ✅ Commercially Safe Models: -- **Bark** (MIT License) - Unrestricted commercial use, no attribution required ⭐ -- **Chatterbox** (MIT License) - Unrestricted commercial use, no attribution required -- **Qwen 3 TTS** (Apache 2.0) - Commercial use allowed, attribution required -- **Kokoro** (Apache 2.0) - Commercial use allowed, attribution required -- **Edge TTS** (Microsoft) - Commercial use allowed -- **ElevenLabs** (Paid API) - Commercial use with valid subscription - -## Provider Licenses - -For transparency and compliance, here are direct links to the official license terms for each supported TTS provider: - -- **Edge TTS (Microsoft)**: [Microsoft Terms of Use](https://www.microsoft.com/en-us/legal/terms-of-use) -- **Kokoro TTS**: [Apache License 2.0](https://github.com/hexgrad/kokoro/blob/main/LICENSE) -- **ElevenLabs TTS**: [ElevenLabs Terms of Service](https://elevenlabs.io/terms) -- **Bark TTS**: [MIT License](https://github.com/suno-ai/bark/blob/main/LICENSE) -- **Chatterbox TTS**: [MIT License](https://github.com/rsxdalv/chatterbox/blob/main/LICENSE) -- **Qwen 3 TTS**: [Apache License 2.0](https://github.com/QwenLM/Qwen3-TTS/blob/main/LICENSE) - -## Optional Dependencies - -Install optional TTS providers: - -```bash -# Install Kokoro TTS (self-hosted lightweight TTS) -pip install voicegenhub[kokoro] - -# Install Bark (self-hosted high-naturalness TTS) -pip install voicegenhub[bark] - -# Install Chatterbox TTS (MIT licensed, multilingual with emotion control) -pip install chatterbox-tts - -# Install Qwen 3 TTS (Apache 2.0 licensed, state-of-the-art multilingual TTS) -pip install voicegenhub[qwen] -``` - -### Kokoro TTS Installation -Kokoro TTS requires Python 3.11 or higher. - -#### Windows & Python 3.13+ Build Limitation - -**Important:** On Windows with Python 3.13+, Kokoro TTS (via curated-tokenizers) may require compiling native code if pre-built wheels are not available. This requires Microsoft Visual C++ Build Tools. 
- -If you see errors about missing C++ compilers or build failures when installing Kokoro, follow these steps: - -1. Download and install [Microsoft Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/). -2. During installation, select "Desktop development with C++" workload. -3. After installation, restart your terminal and retry installation: - ```bash - poetry install --with kokoro - # or - pip install voicegenhub[kokoro] - ``` +--- -If you still see build errors, check for available wheels for `curated-tokenizers` on [PyPI](https://pypi.org/project/curated-tokenizers/#files). If no wheel is available for your Python version, you must build from source (requires Visual C++). +## Key `synthesize` options + +| Flag | Description | +|---|---| +| `TEXT ...` | One or more texts to synthesize | +| `--provider` | TTS provider (`edge`, `kokoro`, `bark`, `chatterbox`, `qwen`, `elevenlabs`) | +| `--voice`, `-v` | Voice / speaker name | +| `--language`, `-l` | Language code (`en`, `zh`, `fr`, ...) | +| `--output`, `-o` | Output file or directory | +| `--gpu [p100\|t4]` | Run on free Kaggle GPU (Qwen) | +| `--model`, `-m` | HuggingFace model ID override | +| `--output-dir` | Local directory for Kaggle batch output | +| `--timeout` | Kaggle polling timeout in minutes (default 60) | +| `--poll-interval` | Kaggle polling interval in seconds (default 60) | +| `--seed` | Random seed for reproducible generation (default 42) | +| `--temperature` | Sampling temperature (lower = more neutral, higher = more expressive; default 0.7) | +| `--exaggeration` | Chatterbox emotion intensity 0–1 | +| `--audio-prompt` | Reference audio for voice cloning (Chatterbox) | -**Recommendation:** For easiest installation, use Python 3.11 or 3.12 on Windows until wheels for Python 3.13+ are published.
+--- -#### Installation +## Docs -```bash -# Using Poetry (recommended): -poetry add voicegenhub[kokoro] -# or: -poetry install --with kokoro -``` +- [Installation & optional dependencies](docs/installation.md) +- [Provider details & voice lists](docs/providers.md) +- [Kaggle GPU setup](docs/kaggle_gpu.md) +- [Voice cloning & design](docs/cloning_and_design.md) +- [Benchmarks & performance](docs/benchmarks_and_performance.md) +- [Licensing](docs/licensing.md) diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index 6475ce4..e2a5155 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -288,7 +288,6 @@ async def generate(): @click.group() def cli(): """VoiceGenHub - Simple Text-to-Speech CLI.""" - pass @cli.command() @@ -430,20 +429,30 @@ def cli(): show_default=True, help="Kaggle GPU: Status polling interval in seconds", ) +@click.option( + "--seed", + type=int, + default=42, + show_default=True, + help="Kaggle GPU: Random seed for reproducible generation", +) +@click.option( + "--temperature", + type=float, + default=0.7, + show_default=True, + help="Kaggle GPU: Sampling temperature (lower = more stable/neutral tone, higher = more expressive)", +) def synthesize( texts, voice, language, output, format, rate, pitch, provider, gpu, cpu, lowpass, normalize, distortion, noise, reverb, pitch_shift, exaggeration, cfg_weight, audio_prompt, turbo, multilingual, instruct, ref_audio, ref_text, - model, output_dir, output_filename, timeout, poll_interval, + model, output_dir, output_filename, timeout, poll_interval, seed, temperature, ): """Generate speech from text(s). 
Use --gpu [p100|t4] for remote Kaggle GPU acceleration.""" # Redirect to Kaggle pipeline if --gpu is specified if gpu: - if len(texts) > 1: - click.echo("Error: --gpu currently supports only single text generation", err=True) - sys.exit(1) - from .kaggle.pipeline import KaggleQwenPipeline pipeline = KaggleQwenPipeline( model_id=model or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", @@ -451,36 +460,33 @@ def synthesize( poll_interval_seconds=poll_interval, ) - # Determine output directory and filename + # Determine output directory timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") suffix = f"_{gpu}" - if output_dir: - # Explicit --output-dir provided resolved_output_dir = output_dir - resolved_output_filename = output_filename elif output: - output_path = Path(output) - if not output_path.suffix: - resolved_output_dir = str(output_path / f"{timestamp}{suffix}") - resolved_output_filename = output_filename - else: - resolved_output_dir = str(output_path.parent / f"{timestamp}{suffix}") - resolved_output_filename = output_path.name + output_path_obj = Path(output) + resolved_output_dir = str(output_path_obj if not output_path_obj.suffix else output_path_obj.parent / f"{timestamp}{suffix}") else: resolved_output_dir = f"{timestamp}{suffix}" - resolved_output_filename = output_filename try: - result_path = pipeline.run( - text=texts[0], + result_paths = pipeline.run( + texts=list(texts), voice=voice or "Ryan", language=language or "en", output_dir=resolved_output_dir, - output_filename=resolved_output_filename, gpu_type=gpu, + seed=seed, + temperature=temperature, ) - click.echo(f"SUCCESS: Remote audio available at: {result_path.absolute()}") + click.echo(f"SUCCESS: {len(result_paths)} audio file(s) in: {Path(resolved_output_dir).absolute()}") + for p in result_paths: + click.echo(f" {p.name}") + manifest = Path(resolved_output_dir) / "manifest.json" + if manifest.exists(): + click.echo(" manifest.json (prompt→file mapping)") return except Exception as e: 
click.echo(f"Error during remote generation: {e}", err=True) @@ -597,7 +603,7 @@ def synthesize( def voices(language: Optional[str], format: str, provider: str): """List available voices.""" # Validate provider immediately - supported_providers = ["edge", "kokoro", "elevenlabs", "bark", "chatterbox"] + supported_providers = ["edge", "kokoro", "elevenlabs", "bark", "chatterbox", "qwen"] if provider and provider not in supported_providers: click.echo( f"Error: Unsupported provider '{provider}'. Supported providers: {', '.join(supported_providers)}", diff --git a/src/voicegenhub/kaggle/pipeline.py b/src/voicegenhub/kaggle/pipeline.py index ae67ef4..8224f94 100644 --- a/src/voicegenhub/kaggle/pipeline.py +++ b/src/voicegenhub/kaggle/pipeline.py @@ -61,14 +61,19 @@ def _detect_kaggle_username() -> str: def _build_notebook_source( - text: str, + texts: list, voice: str, language: str, model_id: str, dtype: str, - output_filename: str, + seed: int = 42, + temperature: float = 0.7, ) -> dict: - """Build the Jupyter notebook content for Kaggle GPU execution.""" + """Build the Jupyter notebook content for Kaggle GPU batch execution. + + Generates one audio file per text entry (audio_001.wav, audio_002.wav, …) + and writes a manifest.json that maps each filename to its source text. 
+ """ # Language mapping (CLI code → Qwen language string) language_map = { @@ -100,17 +105,32 @@ def pip_install(*packages): pip_install("soundfile") """) + # Embed the texts list and metadata directly into the notebook cell gen_code = textwrap.dedent(f"""\ + import json import torch import soundfile as sf from qwen_tts import Qwen3TTSModel - MODEL_ID = {json.dumps(model_id)} - OUTPUT_PATH = "/kaggle/working/{output_filename}" + MODEL_ID = {json.dumps(model_id)} + VOICE = {json.dumps(voice)} + LANGUAGE = {json.dumps(qwen_language)} + TEXTS = {json.dumps(texts)} + OUTPUT_DIR = "/kaggle/working" + SEED = {seed} + TEMPERATURE = {temperature} + + # Pin global seed for reproducibility across runs + torch.manual_seed(SEED) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(SEED) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False print(f"CUDA available: {{torch.cuda.is_available()}}") if torch.cuda.is_available(): print(f"GPU: {{torch.cuda.get_device_name(0)}}") + print(f"Seed: {{SEED}} Temperature: {{TEMPERATURE}}") print(f"Loading model: {{MODEL_ID}}") model = Qwen3TTSModel.from_pretrained( @@ -119,18 +139,38 @@ def pip_install(*packages): dtype=torch.float16, ) - print("Generating speech...") - wavs, sr = model.generate_custom_voice( - text={json.dumps(text)}, - language={json.dumps(qwen_language)}, - speaker={json.dumps(voice)}, - ) - - sf.write(OUTPUT_PATH, wavs[0], sr) - print(f"Audio saved to {{OUTPUT_PATH}}") - print(f"Sample rate: {{sr}} Hz, Duration: {{len(wavs[0])/sr:.2f}}s") + manifest = [] + for i, text in enumerate(TEXTS, start=1): + filename = f"audio_{{i:03d}}.wav" + out_path = f"{{OUTPUT_DIR}}/{{filename}}" + print(f"[{{i}}/{{len(TEXTS)}}] Generating: {{text[:80]}}") + # Re-seed before each text so every audio is independently reproducible + torch.manual_seed(SEED + i) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(SEED + i) + wavs, sr = model.generate_custom_voice( + text=text, + 
language=LANGUAGE, + speaker=VOICE, + temperature=TEMPERATURE, + top_p=0.9, + repetition_penalty=1.1, + ) + sf.write(out_path, wavs[0], sr) + duration = len(wavs[0]) / sr + print(f" -> {{filename}} ({{duration:.2f}}s @ {{sr}} Hz)") + manifest.append({{"index": i, "file": filename, "text": text, "duration_sec": round(duration, 2)}}) + + manifest_path = f"{{OUTPUT_DIR}}/manifest.json" + with open(manifest_path, "w", encoding="utf-8") as f: + json.dump(manifest, f, ensure_ascii=False, indent=2) + + print(f"\\nDone — {{len(TEXTS)}} audio files + manifest.json written to {{OUTPUT_DIR}}") + for entry in manifest: + print(f" {{entry['file']}} {{entry['duration_sec']}}s {{entry['text'][:60]}}") """) + summary_lines = [f"- `audio_{i:03d}.wav`: {t[:80]}{'…' if len(t) > 80 else ''}\n" for i, t in enumerate(texts, 1)] notebook = { "nbformat": 4, "nbformat_minor": 5, @@ -148,11 +188,10 @@ def pip_install(*packages): "id": "intro", "metadata": {}, "source": [ - "# VoiceGenHub — Qwen3-TTS GPU Generation\n", - f"**Model:** `{model_id}` \n", - f"**Text:** {text[:120]}{'...' if len(text) > 120 else ''} \n", - f"**Voice:** {voice} **Language:** {qwen_language}\n", - ], + "# VoiceGenHub — Qwen3-TTS GPU Batch Generation\n", + f"**Model:** `{model_id}` **Voice:** {voice} **Language:** {qwen_language}\n\n", + f"**{len(texts)} texts to synthesize:**\n", + ] + summary_lines, }, { "cell_type": "code", @@ -297,45 +336,53 @@ def __init__( def run( self, - text: str, + texts, voice: str = "Ryan", language: str = "en", output_dir: Optional[str] = None, - output_filename: str = "qwen3_tts.wav", - gpu_type: str = "p100", # "p100" or "t4" - ) -> Path: + gpu_type: str = "p100", + seed: int = 42, + temperature: float = 0.7, + ) -> list: """ - Run the full Kaggle Qwen3-TTS pipeline. + Run the full Kaggle Qwen3-TTS batch pipeline. Args: - text: Text to synthesize. + texts: A single text string or a list of text strings to synthesize. + Each text produces one audio file (audio_001.wav, …). 
voice: Speaker name (e.g. "Ryan", "Serena"). language: ISO language code (e.g. "en", "zh"). - output_dir: Local directory for the downloaded audio file. + output_dir: Local directory for all downloaded files. Defaults to a timestamped folder in the cwd. - output_filename: Filename for the generated audio on Kaggle. gpu_type: Kaggle accelerator type ("p100", "t4"). Returns: - Path to the downloaded audio file. + List of Paths to the downloaded audio files. + A manifest.json is written alongside the audio files linking each + filename to its source text. """ + # Normalise: accept both str and list[str] + if isinstance(texts, str): + texts = [texts] + username = _detect_kaggle_username() kernel_id = f"{username}/{self.kernel_slug}" if output_dir is None: from datetime import datetime - output_dir = datetime.now().strftime("%Y%m%d") + output_dir = datetime.now().strftime("%Y%m%d_%H%M%S") + f"_{gpu_type}" output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) logger.info( - "Starting Kaggle Qwen3-TTS pipeline", + "Starting Kaggle Qwen3-TTS batch pipeline", kernel_id=kernel_id, model=self.model_id, voice=voice, language=language, gpu_type=gpu_type, + num_texts=len(texts), ) # 1. 
Build notebook + push @@ -345,15 +392,21 @@ def run( metadata_path = Path(tmpdir) / "kernel-metadata.json" notebook = _build_notebook_source( - text=text, + texts=texts, voice=voice, language=language, model_id=self.model_id, dtype=self.dtype, - output_filename=output_filename, + seed=seed, + temperature=temperature, ) notebook_path.write_text(json.dumps(notebook, indent=2)) + # Save a copy of the submitted notebook to the output folder for traceability + submitted_nb_dest = output_path / "submitted_notebook.ipynb" + submitted_nb_dest.write_text(json.dumps(notebook, indent=2)) + logger.info(f"Submitted notebook saved: {submitted_nb_dest}") + metadata = _build_kernel_metadata( username, self.kernel_slug, notebook_filename, gpu_type=gpu_type ) @@ -361,7 +414,6 @@ def run( logger.info(f"Pushing kernel to Kaggle: {kernel_id} (accelerator: {gpu_type})") try: - # Accelerator flag to ensure correct resource allocation acc_flag = "nvidia-p100-1" if gpu_type == "p100" else "nvidia-t4-2" push_result = _run_cmd( ["kaggle", "kernels", "push", "-p", tmpdir, "--accelerator", acc_flag], @@ -371,9 +423,6 @@ def run( push_out = push_result.stdout.strip() logger.info(f"Push result: {push_out}") - # Kaggle may create the kernel under a different slug than the - # metadata 'id' field (it slugifies the 'title' instead when - # they differ). Parse the actual URL from the push output. actual_kernel_id = _extract_kernel_id_from_push(push_out, kernel_id) if actual_kernel_id != kernel_id: logger.info( @@ -391,14 +440,15 @@ def run( # 2. Poll until done self._poll_until_complete(kernel_id) - # 3. Download output - local_wav = self._download_output(kernel_id, output_path, output_filename) + # 3. 
Download all output files (audio_*.wav + manifest.json) + local_files = self._download_output(kernel_id, output_path, len(texts)) logger.info( - "Kaggle Qwen3-TTS pipeline complete", - output=str(local_wav), + "Kaggle Qwen3-TTS batch pipeline complete", + output_dir=str(output_path), + num_files=len(local_files), ) - return local_wav + return local_files # ------------------------------------------------------------------ # Internal helpers @@ -449,9 +499,12 @@ def _download_output( self, kernel_id: str, output_path: Path, - output_filename: str, - ) -> Path: - """Download kernel output and extract the audio file.""" + num_texts: int, + ) -> list: + """Download all kernel outputs (audio_*.wav + manifest.json) to output_path. + + Returns a list of Path objects for each downloaded audio file. + """ with tempfile.TemporaryDirectory() as dl_dir: logger.info(f"Downloading kernel outputs from {kernel_id}…") _run_cmd( @@ -460,34 +513,59 @@ def _download_output( check=True, ) - # Kaggle downloads a zip file; find and extract it dl_path = Path(dl_dir) - wav_files = list(dl_path.rglob("*.wav")) - zip_files = list(dl_path.rglob("*.zip")) - # Extract zips first - for zf in zip_files: + # Extract any zip archives first + for zf in list(dl_path.rglob("*.zip")): logger.info(f"Extracting {zf.name}…") with zipfile.ZipFile(zf, "r") as z: z.extractall(dl_path) - wav_files = list(dl_path.rglob("*.wav")) + + wav_files = sorted(dl_path.rglob("*.wav")) + manifest_files = list(dl_path.rglob("manifest.json")) + + # Always copy logs and executed notebook — survive even if no wavs + for log_file in dl_path.rglob("*.log"): + dest = output_path / log_file.name + shutil.copy2(log_file, dest) + logger.info(f"Kernel log saved: {dest} ({dest.stat().st_size:,} bytes)") + + for nb_file in dl_path.rglob("*.ipynb"): + dest = output_path / "executed_notebook.ipynb" + shutil.copy2(nb_file, dest) + logger.info(f"Executed notebook saved: {dest} ({dest.stat().st_size:,} bytes)") if not wav_files: - # List 
what was downloaded for debugging all_files = list(dl_path.rglob("*")) file_list = ", ".join(f.name for f in all_files if f.is_file()) raise FileNotFoundError( - f"No .wav file found in kernel output. Downloaded files: {file_list}\n" + f"No .wav files found in kernel output. Downloaded files: {file_list}\n" f"Check kernel logs: https://www.kaggle.com/code/{kernel_id}" ) - # Find the right wav (matching output_filename if possible) - target_wav = next( - (f for f in wav_files if f.name == output_filename), - wav_files[0], - ) + # Copy all wav files + local_wavs = [] + for wav in wav_files: + dest = output_path / wav.name + shutil.copy2(wav, dest) + logger.info(f"Audio saved locally: {dest} ({dest.stat().st_size:,} bytes)") + local_wavs.append(dest) + + # Copy manifest.json if present + if manifest_files: + manifest_dest = output_path / "manifest.json" + shutil.copy2(manifest_files[0], manifest_dest) + logger.info(f"Manifest saved locally: {manifest_dest}") + else: + # Generate a minimal fallback manifest + manifest_dest = output_path / "manifest.json" + fallback = [ + {"index": i + 1, "file": wav.name, "text": f"(text {i + 1} of {num_texts})"} + for i, wav in enumerate(local_wavs) + ] + manifest_dest.write_text( + json.dumps(fallback, ensure_ascii=False, indent=2), encoding="utf-8" + ) + logger.info(f"Fallback manifest written: {manifest_dest}") - dest = output_path / output_filename - shutil.copy2(target_wav, dest) - logger.info(f"Audio saved locally: {dest}") - return dest + return local_wavs diff --git a/src/voicegenhub/providers/qwen.py b/src/voicegenhub/providers/qwen.py index 4a36e9b..e36918f 100644 --- a/src/voicegenhub/providers/qwen.py +++ b/src/voicegenhub/providers/qwen.py @@ -27,6 +27,22 @@ logger = get_logger(__name__) +# Speaker metadata: name -> (language, locale, gender, description). +# The model's get_supported_speakers() only returns names; this dict supplies +# the additional info needed to build a Voice object. 
New speakers returned +# by the model but absent here fall back to neutral/multilingual defaults. +_SPEAKER_META: Dict[str, tuple] = { + "Ryan": ("en", "en-US", VoiceGender.MALE, "Dynamic male, strong rhythmic drive — English native"), + "Aiden": ("en", "en-US", VoiceGender.MALE, "Sunny American male, clear midrange — English native"), + "Vivian": ("zh", "zh-CN", VoiceGender.FEMALE, "Bright, slightly edgy young female — Chinese native"), + "Serena": ("zh", "zh-CN", VoiceGender.FEMALE, "Warm, gentle young female — Chinese native"), + "Uncle_Fu": ("zh", "zh-CN", VoiceGender.MALE, "Seasoned male, low mellow timbre — Chinese native"), + "Dylan": ("zh", "zh-CN", VoiceGender.MALE, "Youthful Beijing male, natural timbre — Chinese native"), + "Eric": ("zh", "zh-CN", VoiceGender.MALE, "Lively Chengdu male, slightly husky — Chinese native"), + "Ono_Anna": ("ja", "ja-JP", VoiceGender.FEMALE, "Playful female, light and nimble — Japanese native"), + "Sohee": ("ko", "ko-KR", VoiceGender.FEMALE, "Warm female with rich emotion — Korean native"), +} + class QwenTTSProvider(TTSProvider): """ @@ -173,51 +189,55 @@ async def initialize(self) -> None: provider=self.provider_id, ) from e - async def get_voices(self) -> List[Voice]: - """Get available voices based on generation mode.""" - await self.initialize() - - voices = [] + async def get_voices(self, language: Optional[str] = None) -> List[Voice]: + """Return Qwen3-TTS CustomVoice speakers by querying the loaded model. 
- if self.generation_mode == "custom_voice": - # Get supported speakers - try: - speakers = self._model.model.get_supported_speakers() - if speakers: - for speaker in speakers: - voices.append( - Voice( - id=speaker, - name=speaker.capitalize(), - language="multilingual", - locale="mul", - gender=VoiceGender.NEUTRAL, - voice_type=VoiceType.NEURAL, - provider=self.provider_id, - sample_rate=24000, - description=f"Qwen 3 TTS CustomVoice speaker: {speaker}", - ) - ) - except Exception as e: - logger.warning(f"Could not get speakers: {e}") - - if not voices: - # Return generic voice entries for other modes - voices.append( - Voice( - id="default", - name="Default Voice", - language="multilingual", - locale="mul", - gender=VoiceGender.NEUTRAL, - voice_type=VoiceType.NEURAL, - provider=self.provider_id, - sample_rate=24000, - description=f"Qwen 3 TTS {self.generation_mode} mode", + Speakers are enriched with language/gender metadata from _SPEAKER_META. + Speakers not in _SPEAKER_META fall back to neutral/multilingual defaults. + If *language* is provided (e.g. 'en', 'zh', 'ja', 'ko'), only voices whose + native language matches are returned. If no match, the full list is returned. 
+ """ + await self.initialize() + speakers = self._model.model.get_supported_speakers() or [] + + if not speakers: + return [Voice( + id="default", + name="Default", + language="multilingual", + locale="multilingual", + gender=VoiceGender.NEUTRAL, + voice_type=VoiceType.NEURAL, + provider="qwen", + )] + + voices: List[Voice] = [] + for speaker in speakers: + meta = _SPEAKER_META.get(speaker) + if meta: + lang, locale, gender, desc = meta + else: + lang, locale, gender, desc = ( + "multilingual", "multilingual", VoiceGender.NEUTRAL, + f"{speaker} speaker", ) - ) - - return voices + voices.append(Voice( + id=speaker, + name=speaker, + language=lang, + locale=locale, + gender=gender, + voice_type=VoiceType.NEURAL, + provider="qwen", + description=desc, + )) + + if language is None: + return voices + + lang_filter = language.lower().split("-")[0] # normalise "en-US" → "en" + filtered = [v for v in voices if v.language == lang_filter] + return filtered if filtered else voices def _get_native_speaker_for_language(self, language: str) -> str: """Get native speaker for a given language.""" From 4df5bcc97faf17417a17c7a00459353ca8e052ba Mon Sep 17 00:00:00 2001 From: leweex95 Date: Fri, 27 Feb 2026 17:26:06 +0100 Subject: [PATCH 12/13] added seed forwarding to local providers and instruct passthrough to kaggle pipeline --- src/voicegenhub/cli.py | 52 ++++++++++++++++++++++-------- src/voicegenhub/kaggle/pipeline.py | 18 ++++++++++- src/voicegenhub/providers/qwen.py | 9 ++++++ 3 files changed, 65 insertions(+), 14 deletions(-) diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index e2a5155..a838f7d 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -46,6 +46,7 @@ def _process_single( instruct: Optional[str] = None, ref_audio: Optional[str] = None, ref_text: Optional[str] = None, + seed: Optional[int] = None, ): """Process a single text with effects support.""" try: @@ -53,6 +54,22 @@ def _process_single( tts = VoiceGenHub(provider=provider) 
asyncio.run(tts.initialize()) + # Build extra kwargs + extra_kwargs = dict( + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) + if audio_prompt_path: + extra_kwargs["audio_prompt_path"] = audio_prompt_path + if instruct: + extra_kwargs["instruct"] = instruct + if ref_audio: + extra_kwargs["ref_audio"] = ref_audio + if ref_text: + extra_kwargs["ref_text"] = ref_text + if seed is not None: + extra_kwargs["seed"] = seed + # Generate audio response = asyncio.run(tts.generate( text=text, @@ -61,12 +78,7 @@ def _process_single( audio_format=AudioFormat(audio_format), speed=speed, pitch=pitch, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - audio_prompt_path=audio_prompt_path, - instruct=instruct, - ref_audio=ref_audio, - ref_text=ref_text, + **extra_kwargs, )) output_path = Path(output).resolve() if output else Path(".") / f"voicegenhub_output.{audio_format}" @@ -160,6 +172,7 @@ def _process_batch( instruct: Optional[str] = None, ref_audio: Optional[str] = None, ref_text: Optional[str] = None, + seed: Optional[int] = None, ): """Process multiple texts concurrently with provider-specific limits. 
@@ -216,6 +229,20 @@ def process_item(index: int, text: str): try: # Run async generation in thread async def generate(): + gen_kwargs = dict( + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) + if audio_prompt_path: + gen_kwargs["audio_prompt_path"] = audio_prompt_path + if instruct: + gen_kwargs["instruct"] = instruct + if ref_audio: + gen_kwargs["ref_audio"] = ref_audio + if ref_text: + gen_kwargs["ref_text"] = ref_text + if seed is not None: + gen_kwargs["seed"] = seed return await shared_tts.generate( text=text, voice=voice, @@ -223,12 +250,7 @@ async def generate(): audio_format=AudioFormat(audio_format), speed=speed, pitch=pitch, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - audio_prompt_path=audio_prompt_path, - instruct=instruct, - ref_audio=ref_audio, - ref_text=ref_text, + **gen_kwargs, ) response = asyncio.run(generate()) @@ -256,6 +278,7 @@ async def generate(): instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) else: # Save output directly @@ -434,7 +457,7 @@ def cli(): type=int, default=42, show_default=True, - help="Kaggle GPU: Random seed for reproducible generation", + help="Random seed for reproducible generation (local CPU and Kaggle GPU)", ) @click.option( "--temperature", @@ -480,6 +503,7 @@ def synthesize( gpu_type=gpu, seed=seed, temperature=temperature, + instruct=instruct or "", ) click.echo(f"SUCCESS: {len(result_paths)} audio file(s) in: {Path(resolved_output_dir).absolute()}") for p in result_paths: @@ -563,6 +587,7 @@ def synthesize( instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) else: # Single text processing (original behavior) @@ -587,6 +612,7 @@ def synthesize( instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) diff --git a/src/voicegenhub/kaggle/pipeline.py b/src/voicegenhub/kaggle/pipeline.py index 8224f94..78cb007 100644 --- a/src/voicegenhub/kaggle/pipeline.py +++ b/src/voicegenhub/kaggle/pipeline.py @@ -68,11 +68,16 @@ def 
_build_notebook_source( dtype: str, seed: int = 42, temperature: float = 0.7, + instruct: str = "", ) -> dict: """Build the Jupyter notebook content for Kaggle GPU batch execution. Generates one audio file per text entry (audio_001.wav, audio_002.wav, …) and writes a manifest.json that maps each filename to its source text. + + When *instruct* is non-empty, each text is generated via + ``generate_custom_voice(… instruct=instruct)`` so Qwen3's voice-design / + emotion capabilities are used even on remote Kaggle GPUs. """ # Language mapping (CLI code → Qwen language string) @@ -119,6 +124,7 @@ def pip_install(*packages): OUTPUT_DIR = "/kaggle/working" SEED = {seed} TEMPERATURE = {temperature} + INSTRUCT = {json.dumps(instruct)} # Pin global seed for reproducibility across runs torch.manual_seed(SEED) @@ -131,6 +137,8 @@ def pip_install(*packages): if torch.cuda.is_available(): print(f"GPU: {{torch.cuda.get_device_name(0)}}") print(f"Seed: {{SEED}} Temperature: {{TEMPERATURE}}") + if INSTRUCT: + print(f"Instruct: {{INSTRUCT}}") print(f"Loading model: {{MODEL_ID}}") model = Qwen3TTSModel.from_pretrained( @@ -148,7 +156,7 @@ def pip_install(*packages): torch.manual_seed(SEED + i) if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED + i) - wavs, sr = model.generate_custom_voice( + gen_kwargs = dict( text=text, language=LANGUAGE, speaker=VOICE, @@ -156,6 +164,9 @@ def pip_install(*packages): top_p=0.9, repetition_penalty=1.1, ) + if INSTRUCT: + gen_kwargs["instruct"] = INSTRUCT + wavs, sr = model.generate_custom_voice(**gen_kwargs) sf.write(out_path, wavs[0], sr) duration = len(wavs[0]) / sr print(f" -> {{filename}} ({{duration:.2f}}s @ {{sr}} Hz)") @@ -343,6 +354,7 @@ def run( gpu_type: str = "p100", seed: int = 42, temperature: float = 0.7, + instruct: str = "", ) -> list: """ Run the full Kaggle Qwen3-TTS batch pipeline. @@ -355,6 +367,9 @@ def run( output_dir: Local directory for all downloaded files. Defaults to a timestamped folder in the cwd. 
gpu_type: Kaggle accelerator type ("p100", "t4"). + seed: Random seed for reproducible generation. + temperature: Sampling temperature. + instruct: Qwen3 instruct string for voice design / emotion control. Returns: List of Paths to the downloaded audio files. @@ -399,6 +414,7 @@ def run( dtype=self.dtype, seed=seed, temperature=temperature, + instruct=instruct, ) notebook_path.write_text(json.dumps(notebook, indent=2)) diff --git a/src/voicegenhub/providers/qwen.py b/src/voicegenhub/providers/qwen.py index e36918f..30a59b0 100644 --- a/src/voicegenhub/providers/qwen.py +++ b/src/voicegenhub/providers/qwen.py @@ -292,6 +292,15 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: } generate_kwargs.update(request.extra_params) + # Seed support: pin torch seed before generation for reproducibility + seed = generate_kwargs.pop("seed", None) + if seed is not None: + seed = int(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + logger.info(f"Pinned random seed: {seed}") + # Generate based on mode if self.generation_mode == "custom_voice": speaker = generate_kwargs.pop("speaker", self.default_speaker or request.voice_id) From 7d4c83e69a8599af74eb50a5333622e93dae8fd1 Mon Sep 17 00:00:00 2001 From: leweex95 Date: Sat, 28 Feb 2026 08:28:02 +0100 Subject: [PATCH 13/13] voice cloning confirmed to run --- README.md | 21 +++ src/voicegenhub/cli.py | 2 + src/voicegenhub/kaggle/pipeline.py | 259 ++++++++++++++++++++++++----- src/voicegenhub/providers/qwen.py | 8 +- 4 files changed, 248 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index a8c8ff7..10c4123 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,27 @@ poetry run voicegenhub synthesize "Hello!" 
\ --poll-interval 30 ``` +### Voice Cloning (Qwen3-TTS, Kaggle GPU) + +Clone your own voice onto arbitrary text using the Qwen3-TTS Base model and a reference WAV: + +```bash +poetry run voicegenhub synthesize "this is my speech using my own voice" \ + --provider qwen \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --audio-prompt "" \ + --ref-text "" \ + --gpu p100 +``` + +**Tips:** +- Use a reference WAV of at least 20 seconds, with clear speech and a matching transcript for best results. +- The `--ref-text` should be the exact transcript of the reference audio (no ellipsis or truncation). +- For batch synthesis, pass multiple texts in quotes. + +See [docs/cloning_and_design.md](docs/cloning_and_design.md) for advanced usage and troubleshooting. +``` + Batch output lands in a timestamped folder (e.g. `20260227_123130_p100/`) with: - `audio_001.wav`, `audio_002.wav`, … (one per input sentence) - `manifest.json` — maps each filename to its source text and duration diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index a838f7d..721ff22 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -504,6 +504,8 @@ def synthesize( seed=seed, temperature=temperature, instruct=instruct or "", + ref_audio_path=audio_prompt or "", + ref_text=ref_text or "", ) click.echo(f"SUCCESS: {len(result_paths)} audio file(s) in: {Path(resolved_output_dir).absolute()}") for p in result_paths: diff --git a/src/voicegenhub/kaggle/pipeline.py b/src/voicegenhub/kaggle/pipeline.py index 78cb007..4ddab6c 100644 --- a/src/voicegenhub/kaggle/pipeline.py +++ b/src/voicegenhub/kaggle/pipeline.py @@ -69,15 +69,20 @@ def _build_notebook_source( seed: int = 42, temperature: float = 0.7, instruct: str = "", + ref_audio_kernel_path: str = "", + ref_text: str = "", ) -> dict: """Build the Jupyter notebook content for Kaggle GPU batch execution. 
Generates one audio file per text entry (audio_001.wav, audio_002.wav, …) and writes a manifest.json that maps each filename to its source text. - When *instruct* is non-empty, each text is generated via - ``generate_custom_voice(… instruct=instruct)`` so Qwen3's voice-design / - emotion capabilities are used even on remote Kaggle GPUs. + When *ref_audio_kernel_path* is non-empty (e.g. + ``/kaggle/input/voicegenhub-ref-audio/levi_voice.wav``) the notebook calls + ``generate_voice_clone()`` using that file as the reference speaker. When + *instruct* is also provided it is forwarded to the clone call for + style/emotion control. When only *instruct* is set, the named VOICE + speaker is used via ``generate_custom_voice(instruct=…)``. """ # Language mapping (CLI code → Qwen language string) @@ -117,14 +122,16 @@ def pip_install(*packages): import soundfile as sf from qwen_tts import Qwen3TTSModel - MODEL_ID = {json.dumps(model_id)} - VOICE = {json.dumps(voice)} - LANGUAGE = {json.dumps(qwen_language)} - TEXTS = {json.dumps(texts)} - OUTPUT_DIR = "/kaggle/working" - SEED = {seed} - TEMPERATURE = {temperature} - INSTRUCT = {json.dumps(instruct)} + MODEL_ID = {json.dumps(model_id)} + VOICE = {json.dumps(voice)} + LANGUAGE = {json.dumps(qwen_language)} + TEXTS = {json.dumps(texts)} + OUTPUT_DIR = "/kaggle/working" + SEED = {seed} + TEMPERATURE = {temperature} + INSTRUCT = {json.dumps(instruct)} + REF_AUDIO_PATH = {json.dumps(ref_audio_kernel_path)} + REF_TEXT = {json.dumps(ref_text)} # Pin global seed for reproducibility across runs torch.manual_seed(SEED) @@ -139,6 +146,11 @@ def pip_install(*packages): print(f"Seed: {{SEED}} Temperature: {{TEMPERATURE}}") if INSTRUCT: print(f"Instruct: {{INSTRUCT}}") + if REF_AUDIO_PATH: + print(f"Voice clone mode: reference audio at {{REF_AUDIO_PATH}}") + + # Reference audio path set directly from Kaggle dataset input + _ref_audio_path = REF_AUDIO_PATH if REF_AUDIO_PATH else None print(f"Loading model: {{MODEL_ID}}") model = 
Qwen3TTSModel.from_pretrained( @@ -147,6 +159,16 @@ def pip_install(*packages): dtype=torch.float16, ) + # Guard: verify the loaded model supports voice cloning before entering the loop + if _ref_audio_path: + _mt = getattr(model.model, 'tts_model_type', 'unknown') + if _mt != 'base': + raise ValueError( + "Voice cloning requires tts_model_type='base' but got: " + str(_mt) + + ". MODEL_ID=" + MODEL_ID + " does not support generate_voice_clone(). " + "Switch to a Qwen3-TTS base model (e.g. Qwen/Qwen3-TTS-12Hz-1.7B-Base)." + ) + manifest = [] for i, text in enumerate(TEXTS, start=1): filename = f"audio_{{i:03d}}.wav" @@ -156,17 +178,47 @@ def pip_install(*packages): torch.manual_seed(SEED + i) if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED + i) - gen_kwargs = dict( - text=text, - language=LANGUAGE, - speaker=VOICE, - temperature=TEMPERATURE, - top_p=0.9, - repetition_penalty=1.1, - ) - if INSTRUCT: - gen_kwargs["instruct"] = INSTRUCT - wavs, sr = model.generate_custom_voice(**gen_kwargs) + if _ref_audio_path: + # non_streaming_mode=True is required — the default (False) simulates + # streaming and does not terminate properly for single non-streaming calls, + # causing runaway generation (e.g. 10+ minutes of garbage audio). + clone_kwargs = dict( + text=text, + language=LANGUAGE, + ref_audio=_ref_audio_path, + temperature=TEMPERATURE, + top_p=0.9, + repetition_penalty=1.1, + non_streaming_mode=True, + ) + if REF_TEXT: + # Strip trailing ellipsis — ICL mode requires ref_text to match + # the actual audio content; truncated text causes runaway generation. + _clean_ref_text = REF_TEXT.rstrip(". 
").rstrip("…").rstrip(".") + clone_kwargs["ref_text"] = _clean_ref_text + if INSTRUCT: + clone_kwargs["instruct"] = INSTRUCT + try: + wavs, sr = model.generate_voice_clone(**clone_kwargs) + except TypeError as _e: + if INSTRUCT and "instruct" in str(_e): + print("Note: instruct not supported in clone mode, retrying without: " + str(_e)) + del clone_kwargs["instruct"] + wavs, sr = model.generate_voice_clone(**clone_kwargs) + else: + raise + else: + gen_kwargs = dict( + text=text, + language=LANGUAGE, + speaker=VOICE, + temperature=TEMPERATURE, + top_p=0.9, + repetition_penalty=1.1, + ) + if INSTRUCT: + gen_kwargs["instruct"] = INSTRUCT + wavs, sr = model.generate_custom_voice(**gen_kwargs) sf.write(out_path, wavs[0], sr) duration = len(wavs[0]) / sr print(f" -> {{filename}} ({{duration:.2f}}s @ {{sr}} Hz)") @@ -225,8 +277,52 @@ def pip_install(*packages): return notebook +def _upload_ref_audio_dataset(audio_file: Path, username: str) -> str: + """Upload *audio_file* to a private Kaggle dataset and return the slug. + + Uses the stable slug ``voicegenhub-ref-audio``. If the dataset does not + yet exist the first call creates it; subsequent calls push a new version, + making this fully idempotent from the caller's perspective. + """ + dataset_slug = "voicegenhub-ref-audio" + with tempfile.TemporaryDirectory() as ds_dir: + ds_path = Path(ds_dir) + shutil.copy2(audio_file, ds_path / audio_file.name) + ds_meta = { + "title": "VoiceGenHub Reference Audio", + "id": f"{username}/{dataset_slug}", + "licenses": [{"name": "other"}], + } + (ds_path / "dataset-metadata.json").write_text(json.dumps(ds_meta)) + # Try updating an existing version first; fall back to creating from scratch. 
+ try: + result = _run_cmd( + ["kaggle", "datasets", "version", "-p", ds_dir, + "-m", "voicegenhub ref audio update", "-q"], + capture=True, check=True, + ) + logger.info(f"Reference audio dataset version pushed: {result.stdout.strip()}") + except subprocess.CalledProcessError: + try: + result = _run_cmd( + ["kaggle", "datasets", "create", "-p", ds_dir, "-q"], + capture=True, check=True, + ) + logger.info(f"Reference audio dataset created: {result.stdout.strip()}") + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"Failed to upload reference audio as Kaggle dataset.\n" + f"stdout: {exc.stdout.strip()}\nstderr: {exc.stderr.strip()}" + ) from exc + return dataset_slug + + def _build_kernel_metadata( - username: str, kernel_slug: str, notebook_filename: str, gpu_type: str = "p100" + username: str, + kernel_slug: str, + notebook_filename: str, + gpu_type: str = "p100", + dataset_sources: Optional[list] = None, ) -> dict: """Build Kaggle kernel-metadata.json. @@ -247,7 +343,7 @@ def _build_kernel_metadata( "enable_gpu": True, "enable_tpu": False, "enable_internet": True, - "dataset_sources": [], + "dataset_sources": dataset_sources or [], "competition_sources": [], "kernel_sources": [], "model_sources": [], @@ -355,6 +451,8 @@ def run( seed: int = 42, temperature: float = 0.7, instruct: str = "", + ref_audio_path: str = "", + ref_text: str = "", ) -> list: """ Run the full Kaggle Qwen3-TTS batch pipeline. @@ -362,14 +460,24 @@ def run( Args: texts: A single text string or a list of text strings to synthesize. Each text produces one audio file (audio_001.wav, …). - voice: Speaker name (e.g. "Ryan", "Serena"). + voice: Speaker name (e.g. "Ryan", "Serena"). Ignored when + *ref_audio_path* is provided (voice cloning mode). language: ISO language code (e.g. "en", "zh"). output_dir: Local directory for all downloaded files. Defaults to a timestamped folder in the cwd. gpu_type: Kaggle accelerator type ("p100", "t4"). 
seed: Random seed for reproducible generation. temperature: Sampling temperature. - instruct: Qwen3 instruct string for voice design / emotion control. + instruct: Qwen3 instruct string for style/emotion control. Works + in both custom-voice and voice-clone modes. + ref_audio_path: Local path to a reference WAV file for voice + cloning. The file is uploaded once as a private + Kaggle dataset (``voicegenhub-ref-audio``) and + attached as a data source so the kernel can read + it from ``/kaggle/input/`` without any notebook + size inflation. + ref_text: Optional transcript of the reference audio. Improves + clone quality when provided. Returns: List of Paths to the downloaded audio files. @@ -380,6 +488,16 @@ def run( if isinstance(texts, str): texts = [texts] + # Guard: CustomVoice model variants do not support generate_voice_clone. + # Fail early with a clear message rather than a cryptic Kaggle kernel error. + if ref_audio_path and "CustomVoice" in self.model_id: + raise ValueError( + f"Voice cloning (--audio-prompt) is not supported by '{self.model_id}'.\n" + "CustomVoice variants only provide predefined speaker voices.\n" + "To clone a reference voice you must use a Qwen3-TTS *base* model.\n" + "Pass: --model Qwen/Qwen3-TTS-12Hz-1.7B-Base" + ) + username = _detect_kaggle_username() kernel_id = f"{username}/{self.kernel_slug}" @@ -406,6 +524,35 @@ def run( notebook_path = Path(tmpdir) / notebook_filename metadata_path = Path(tmpdir) / "kernel-metadata.json" + # ---------------------------------------------------------------- + # Upload reference audio as a private Kaggle dataset so the + # notebook can read it from /kaggle/input/ without any + # notebook-cell-size issues. 
+ # ---------------------------------------------------------------- + ref_dataset_slug = None # set when a reference audio is provided + if ref_audio_path: + ref_audio_file = Path(ref_audio_path) + if not ref_audio_file.exists(): + raise FileNotFoundError( + f"Reference audio file not found: {ref_audio_path}" + ) + ref_dataset_slug = _upload_ref_audio_dataset( + ref_audio_file, username + ) + logger.info( + f"Reference audio dataset ready: {username}/{ref_dataset_slug}" + ) + + # Determine ref audio kernel path and dataset sources for metadata + ref_audio_kernel_path = "" + kernel_dataset_sources = [] + if ref_dataset_slug: + ref_audio_file = Path(ref_audio_path) + ref_audio_kernel_path = ( + f"/kaggle/input/{ref_dataset_slug}/{ref_audio_file.name}" + ) + kernel_dataset_sources = [f"{username}/{ref_dataset_slug}"] + notebook = _build_notebook_source( texts=texts, voice=voice, @@ -415,6 +562,8 @@ def run( seed=seed, temperature=temperature, instruct=instruct, + ref_audio_kernel_path=ref_audio_kernel_path, + ref_text=ref_text or "", ) notebook_path.write_text(json.dumps(notebook, indent=2)) @@ -424,7 +573,9 @@ def run( logger.info(f"Submitted notebook saved: {submitted_nb_dest}") metadata = _build_kernel_metadata( - username, self.kernel_slug, notebook_filename, gpu_type=gpu_type + username, self.kernel_slug, notebook_filename, + gpu_type=gpu_type, + dataset_sources=kernel_dataset_sources, ) metadata_path.write_text(json.dumps(metadata, indent=2)) @@ -520,24 +671,50 @@ def _download_output( """Download all kernel outputs (audio_*.wav + manifest.json) to output_path. Returns a list of Path objects for each downloaded audio file. + Retries up to 3 times with a 30-second delay to handle the case where + Kaggle marks a kernel COMPLETE before output files are fully staged. 
""" - with tempfile.TemporaryDirectory() as dl_dir: - logger.info(f"Downloading kernel outputs from {kernel_id}…") - _run_cmd( - ["kaggle", "kernels", "output", kernel_id, "-p", dl_dir], - capture=True, - check=True, - ) + max_retries = 3 + retry_delay_seconds = 30 + with tempfile.TemporaryDirectory() as dl_dir: dl_path = Path(dl_dir) + wav_files = [] + + for attempt in range(1, max_retries + 1): + logger.info(f"Downloading kernel outputs from {kernel_id}… (attempt {attempt}/{max_retries})") + try: + _run_cmd( + ["kaggle", "kernels", "output", kernel_id, "-p", dl_dir], + capture=True, + check=True, + ) + except subprocess.CalledProcessError as exc: + logger.warning(f"Download attempt {attempt} failed (exit {exc.returncode}): {exc.stderr}") + if attempt < max_retries: + logger.info(f"Retrying in {retry_delay_seconds}s…") + time.sleep(retry_delay_seconds) + continue + raise + + # Extract any zip archives first + for zf in list(dl_path.rglob("*.zip")): + logger.info(f"Extracting {zf.name}…") + with zipfile.ZipFile(zf, "r") as z: + z.extractall(dl_path) + + wav_files = sorted(dl_path.rglob("*.wav")) + if wav_files: + break + + # Kaggle sometimes returns COMPLETE before output files are staged + if attempt < max_retries: + logger.warning( + f"No .wav files found on attempt {attempt} — " + f"Kaggle output may not be staged yet. 
Retrying in {retry_delay_seconds}s…" + ) + time.sleep(retry_delay_seconds) - # Extract any zip archives first - for zf in list(dl_path.rglob("*.zip")): - logger.info(f"Extracting {zf.name}…") - with zipfile.ZipFile(zf, "r") as z: - z.extractall(dl_path) - - wav_files = sorted(dl_path.rglob("*.wav")) manifest_files = list(dl_path.rglob("manifest.json")) # Always copy logs and executed notebook — survive even if no wavs diff --git a/src/voicegenhub/providers/qwen.py b/src/voicegenhub/providers/qwen.py index 30a59b0..585d559 100644 --- a/src/voicegenhub/providers/qwen.py +++ b/src/voicegenhub/providers/qwen.py @@ -352,8 +352,9 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: ref_audio = generate_kwargs.pop("ref_audio", self.default_ref_audio) ref_text = generate_kwargs.pop("ref_text", self.default_ref_text) x_vector_only = generate_kwargs.pop("x_vector_only_mode", self.x_vector_only_mode) + instruct = generate_kwargs.pop("instruct", self.default_instruct) - wavs, sample_rate = self._model.generate_voice_clone( + clone_call_kwargs = dict( text=request.text, language=language, ref_audio=ref_audio, @@ -362,6 +363,11 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: non_streaming_mode=self.non_streaming_mode, **generate_kwargs ) + if instruct: + clone_call_kwargs["instruct"] = instruct + logger.info(f"Voice clone + instruct: '{instruct}'") + + wavs, sample_rate = self._model.generate_voice_clone(**clone_call_kwargs) else: raise TTSError(