diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6d08642..04e64a7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,15 @@ repos: - id: check-merge-conflict - id: debug-statements + - repo: https://github.com/PyCQA/autoflake + rev: v2.3.1 + hooks: + - id: autoflake + args: + - --in-place + - --remove-unused-variables + - --remove-all-unused-imports + - repo: https://github.com/pycqa/flake8 rev: 7.0.0 hooks: diff --git a/README.md b/README.md index e779029..10c4123 100644 --- a/README.md +++ b/README.md @@ -1,466 +1,197 @@ -[![Unit tests](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml) +[![Unit tests](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/unit-tests.yml) [![Daily regression test](https://github.com/leweex95/voicegenhub/actions/workflows/daily-regression-test.yml/badge.svg)](https://github.com/leweex95/voicegenhub/actions/workflows/daily-regression-test.yml) [![codecov](https://codecov.io/gh/leweex95/voicegenhub/branch/master/graph/badge.svg)](https://codecov.io/gh/leweex95/voicegenhub) # VoiceGenHub -Simple, user-friendly Text-to-Speech (TTS) library with CLI and Python API. Supports multiple free and commercial TTS providers. +Simple CLI-first Text-to-Speech library supporting multiple free and commercial providers — including free Kaggle GPU inference for state-of-the-art models. 
-## Installation +--- -```bash -pip install voicegenhub -# or -poetry add voicegenhub -``` - -### Optional Dependencies - -- **Microsoft Edge TTS** (free, cloud-based) -- **Kokoro TTS** (Apache 2.0 licensed, self-hosted lightweight TTS) -- **Bark TTS** (MIT licensed, self-hosted high-naturalness TTS with prosody control) -- **Chatterbox TTS** (MIT licensed, multilingual with emotion control) - Works on CPU or GPU -- **Qwen 3 TTS** (Apache 2.0 licensed, multilingual with voice design and cloning) - State-of-the-art quality -- **ElevenLabs TTS** (commercial, high-quality voices) - -### Voice Cloning Support - -For voice cloning features with Chatterbox TTS: - -```bash -pip install voicegenhub[voice-cloning] -# or -poetry install -E voice-cloning -``` - -**Voice cloning requirements:** -- FFmpeg (manual installation required) -- PyTorch (standard version) - -**On Windows:** Download the "full-shared" FFmpeg build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and add the `bin` directory to your system PATH. +## Install -**Note:** VoiceGenHub includes a compatibility layer to ensure stable execution on CPU-only systems and prevents common import-time crashes related to experimental dependencies like TorchCodec. Standard TTS and voice cloning mechanisms will automatically fall back to supported audio loaders if needed. - -## Usage - -### Chatterbox TTS - -```bash -poetry run voicegenhub synthesize "Hello, world!" 
--provider chatterbox --voice chatterbox-default --output hello.wav -``` - -**Chatterbox features:** -- **Model selection via voice**: Choose between standard, turbo, or multilingual models using the `--voice` flag -- Emotion/intensity control with `exaggeration` parameter (0.0-1.0) -- Zero-shot voice cloning from audio samples -- MIT License - fully commercial compatible -- State-of-the-art quality (competitive with ElevenLabs) -- Built-in Perth watermarking for responsible AI - -**Chatterbox voices:** -- `chatterbox-default`: Standard English model with emotion control -- `chatterbox-turbo`: Turbo English model (faster generation, English only) -- `chatterbox-`: Multilingual model for specific languages (e.g., `chatterbox-es` for Spanish) - -**Chatterbox parameters:** -- `--exaggeration`: Emotion intensity (0.0-1.0, default 0.5). Higher values = more dramatic/emotional. -- `--cfg-weight`: Classifier-free guidance weight (0.0-1.0, default 0.5). Controls the influence of the text prompt. -- `--audio-prompt`: Path to reference audio for voice cloning (optional). -- `temperature`, `max_new_tokens`, `repetition_penalty`, `min_p`, `top_p`: Advanced generation parameters (available in Python API). - -**Multilingual Support:** -Chatterbox supports 23 languages. Use the appropriate voice for the target language: ```bash -poetry run voicegenhub synthesize "Hola, esto es una prueba de voz en español." --provider chatterbox --voice chatterbox-es --output spanish.wav -``` - -**Chatterbox supported languages:** ar, da, de, el, en, es, fi, fr, he, hi, it, ja, ko, ms, nl, no, pl, pt, ru, sv, sw, tr, zh - -**Chatterbox Installation Requirements:** -- **TorchCodec** (optional): Required for voice cloning features. Install with `pip install torchcodec` or `poetry install -E voice-cloning`. -- **FFmpeg**: Required when TorchCodec is installed for voice cloning. 
On Windows, install the "full-shared" build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and ensure FFmpeg's `bin` directory is in your system PATH. -- **PyTorch Compatibility**: TorchCodec 0.9.1 requires PyTorch ≤ 2.4.x. If you have a newer PyTorch version, voice cloning will be automatically disabled with a fallback to standard TTS. -- Without TorchCodec/FFmpeg, basic TTS will work but voice cloning (`--audio-prompt`) will gracefully fall back to standard TTS without cloning. - -### Qwen 3 TTS - -```bash -poetry run voicegenhub synthesize "Hello, world!" --provider qwen --voice Ryan --output hello.wav -``` - -**Qwen 3 TTS features:** -- **Three generation modes**: CustomVoice (predefined speakers), VoiceDesign (natural language voice description), VoiceClone (reference audio-based) -- **10 languages**: Chinese, English, French, German, Italian, Japanese, Korean, Portuguese, Russian, Spanish -- **Native speakers**: Automatic selection of native speakers per language for natural, accent-free speech -- **Voice control via natural language**: Use `instruct` parameter to control emotion, tone, speaking rate, and style -- **Ultra-low latency**: Streaming generation with <100ms first-token latency -- **Apache 2.0 License**: Fully commercial compatible -- **State-of-the-art quality**: Competitive with ElevenLabs, developed by Alibaba's Qwen team - -#### Mode 1: CustomVoice (Predefined Speakers) - -Use predefined premium speakers with optional emotion/style control: - -```bash -# Basic usage with auto-selected native speaker -poetry run voicegenhub synthesize "Hello, this is a test." --provider qwen --language en --output output.wav - -# Explicit speaker selection -poetry run voicegenhub synthesize "Hello, this is a test." --provider qwen --language en --voice Ryan --output output.wav - -# With emotion instruction -poetry run voicegenhub synthesize "I'm so excited about this news!" 
--provider qwen --language en --voice Ryan --instruct "Speak with excitement and joy" --output happy.wav +poetry add voicegenhub ``` -**Available speakers and their native languages:** - -| Speaker | Description | Native Language | Best For | -|---------|-------------|----------------|----------| -| **Ryan** | Dynamic male voice with strong rhythmic drive | English | English content, presentations | -| **Aiden** | Sunny American male voice with clear midrange | English | English content, narration | -| **Vivian** | Bright, slightly edgy young female voice | Chinese | Mandarin content, audiobooks | -| **Serena** | Warm, gentle young female voice | Chinese | Mandarin content, customer service | -| **Uncle_Fu** | Seasoned male voice with low, mellow timbre | Chinese | Mandarin narration, mature content | -| **Dylan** | Youthful Beijing male voice, natural timbre | Chinese (Beijing) | Beijing dialect content | -| **Eric** | Lively Chengdu male voice, slightly husky | Chinese (Sichuan) | Sichuan dialect content | -| **Ono_Anna** | Playful Japanese female, light and nimble | Japanese | Japanese content, anime | -| **Sohee** | Warm Korean female with rich emotion | Korean | Korean content, storytelling | - -**Auto-speaker selection:** If no speaker is specified, Qwen 3 TTS automatically selects a native speaker based on the target language (e.g., Ryan for English, Serena for Chinese). 
- -**Emotion and style control:** Use the `--instruct` parameter with natural language to control voice characteristics: -- `"Speak with excitement and joy"` -- `"Very angry tone"` -- `"Whisper gently"` -- `"Speak slowly and calmly"` -- `"Energetic and enthusiastic"` - -#### Mode 2: VoiceDesign (Natural Language Voice Description) - -Design custom voices using natural language instructions (requires `Qwen3-TTS-VoiceDesign` model): - -```python -from voicegenhub.providers.factory import provider_factory -from voicegenhub.providers.base import TTSRequest - -config = { - "model_name_or_path": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", - "generation_mode": "voice_design", -} - -await provider_factory.discover_provider("qwen") -provider = await provider_factory.create_provider("qwen", config=config) - -request = TTSRequest( - text="Welcome to our demonstration.", - language="en", - voice_id="default", - extra_params={ - "instruct": "Male, 30 years old, confident and professional tone, deep voice with clear articulation" - } -) -response = await provider.synthesize(request) -``` +--- -**VoiceDesign instruction examples:** -- `"Female, 25 years old, cheerful and energetic, slightly high-pitched with playful intonation"` -- `"Male, 17 years old, gaining confidence, deeper breath support, vowels tighten when nervous"` -- `"Elderly male, 70 years old, wise and gentle, slightly raspy with warm timbre"` - -#### Mode 3: VoiceClone (Reference Audio-Based) - -Clone voices from 3-second audio samples (requires `Qwen3-TTS-Base` model): - -```python -from voicegenhub.providers.factory import provider_factory -from voicegenhub.providers.base import TTSRequest - -config = { - "model_name_or_path": "Qwen/Qwen3-TTS-12Hz-1.7B-Base", - "generation_mode": "voice_clone", -} - -await provider_factory.discover_provider("qwen") -provider = await provider_factory.create_provider("qwen", config=config) - -request = TTSRequest( - text="This is synthesized using the cloned voice.", - language="en", - 
voice_id="default", - extra_params={ - "ref_audio": "path/to/reference.wav", # Can be local path, URL, or numpy array - "ref_text": "Transcript of the reference audio", # Required for best quality - "x_vector_only_mode": False # Set True to skip ref_text (lower quality) - } -) -response = await provider.synthesize(request) -``` +## Providers -**Voice cloning tips:** -- Use clear, noise-free reference audio (3-10 seconds) -- Provide accurate transcript (`ref_text`) for best cloning quality -- Supports multilingual cloning (clone any language, synthesize in any language) -- Combine with VoiceDesign to create reusable custom voices +| Provider | License | Local / Cloud | Notes | +|---|---|---|---| +| **Edge TTS** | Free (Microsoft) | Cloud | Fastest, zero setup | +| **Kokoro** | Apache 2.0 | Local | Lightweight, high quality | +| **Bark** | MIT | Local | Prosody markers, 100+ voices | +| **Chatterbox** | MIT | Local | Emotion control, voice cloning | +| **Qwen 3 TTS** | Apache 2.0 | Local / Kaggle GPU | State-of-the-art multilingual | +| **ElevenLabs** | Paid API | Cloud | Commercial-grade voices | -#### Word Emphasis and Pause Control +--- -**Note:** Qwen 3 TTS does not support explicit word-level emphasis markup (like SSML tags) or pause control. Instead, the model intelligently interprets text and applies natural prosody based on: +## Synthesize -1. **Context understanding**: The model reads the entire sentence and applies appropriate emphasis to important words automatically -2. **Natural language instructions**: Use the `instruct` parameter to guide overall tone and pacing: - - `"Speak slowly with emphasis on key words"` - - `"Pause dramatically between sentences"` - - `"Fast-paced and energetic delivery"` -3. 
**Punctuation**: The model respects punctuation for natural pauses (commas, periods, ellipses, em-dashes) +### Edge TTS (fastest, no setup) -**Example:** ```bash -# The model will naturally emphasize "incredible results" due to context -poetry run voicegenhub synthesize "We achieved incredible results!" --provider qwen --voice Ryan --instruct "Speak with excitement and emphasis" --output emphasized.wav +poetry run voicegenhub synthesize "Hello, world!" --provider edge --voice en-US-AriaNeural --output hello.mp3 ``` -#### Model Selection - -Qwen 3 TTS offers multiple models optimized for different use cases: - -| Model | Size | Best For | Streaming | GPU Recommended |Supports | -|-------|------|----------|-----------|-----------------|---------| -| `Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice` | 600M | Default, fast generation, predefined speakers | ✅ | Optional | CustomVoice | -| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | 1.7B | Higher quality, predefined speakers | ✅ | Yes | CustomVoice | -| `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | 1.7B | Custom voice design via natural language | ✅ | Yes | VoiceDesign | -| `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | 1.7B | Voice cloning from audio samples | ✅ | Yes | VoiceClone | -| `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | 600M | Voice cloning, faster generation | ✅ | Optional | VoiceClone | +### Kokoro -**Installation:** ```bash -pip install voicegenhub[qwen] -# or -poetry install --with qwen +poetry run voicegenhub synthesize "Hello, world!" 
--provider kokoro --voice kokoro-af_alloy --output hello.wav ``` -**Qwen 3 TTS parameters (Python API):** -- `model_name_or_path`: Model to use (see table above) -- `device`: "cuda", "cpu", or "auto" (default: auto) -- `dtype`: "float32", "float16", "bfloat16" (default: bfloat16) -- `attn_implementation`: "eager", "sdpa", "flash_attention_2" (default: eager) -- `generation_mode`: "custom_voice", "voice_design", "voice_clone" -- `speaker`: Speaker name for CustomVoice mode -- `instruct`: Emotion/style instruction (for CustomVoice) or voice description (for VoiceDesign) -- `temperature`, `top_p`, `top_k`, `repetition_penalty`, `max_new_tokens`: Advanced sampling parameters - ### Bark ```bash poetry run voicegenhub synthesize "Hello, world!" --provider bark --voice bark-en_speaker_0 --output hello.wav ``` -**Bark features:** -- Highest naturalness among open-source TTS -- Prosody markers for emotional expression: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]` -- 100+ speaker presets -- Sound effects generation - -**Bark supported voices:** Use preset names like `bark-en_speaker_0`, `bark-en_speaker_1`, etc. +Bark supports prosody markers: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]`. -### Edge TTS +### Chatterbox (emotion control + voice cloning) ```bash -poetry run voicegenhub synthesize "Hello, world!" --provider edge --voice en-US-AriaNeural --output hello.mp3 -``` - -**Edge TTS supported voices:** Check the list of supported voices [here](https://speech.microsoft.com/portal/voicegallery). +# Basic +poetry run voicegenhub synthesize "Hello, world!" --provider chatterbox --voice chatterbox-default --output hello.wav -### Kokoro TTS +# With emotion intensity +poetry run voicegenhub synthesize "This is incredible!" --provider chatterbox --voice chatterbox-default --exaggeration 0.8 --output excited.wav -```bash -poetry run voicegenhub synthesize "Hello, world!" 
--provider kokoro --voice kokoro-af_alloy --output hello.wav +# Voice cloning from a reference file +poetry run voicegenhub synthesize "Hello, cloned voice." --provider chatterbox --voice chatterbox-default --audio-prompt reference.wav --output cloned.wav ``` -**Kokoro supported voices:** Check the list of supported voices [here](https://github.com/nazdridoy/kokoro-tts?tab=readme-ov-file#supported-voices). - ### ElevenLabs ```bash poetry run voicegenhub synthesize "Hello, world!" --provider elevenlabs --voice elevenlabs-EXAVITQu4vr4xnSDxMaL --output hello.mp3 ``` -Set your API key in `config/elevenlabs-api-key.json` (the key should be stored as the value for `"ELEVENLABS_API_KEY"` in the JSON file). +Store your API key in `config/elevenlabs-api-key.json` as `{"ELEVENLABS_API_KEY": "..."}`. -**ElevenLabs supported voices:** Check the list of supported voices [here](https://elevenlabs.io/docs/voices). +--- -## Print all available voices per provider +## Qwen 3 TTS ```bash -poetry run voicegenhub voices --language en --provider chatterbox -poetry run voicegenhub voices --language en --provider bark -poetry run voicegenhub voices --language en --provider edge -poetry run voicegenhub voices --language en --provider kokoro -poetry run voicegenhub voices --language en --provider elevenlabs -``` +poetry run voicegenhub synthesize "Hello from the GPU!" --provider qwen --voice Ryan --language en --gpu p100 -## Batch Processing with Concurrency Control +# Batch multiple sentences in one GPU job, saved as audio_001.wav … audio_003.wav + manifest.json +poetry run voicegenhub synthesize \ + "The quick brown fox jumps over the lazy dog." \ + "Technology is changing the world at an unprecedented pace." \ + "The sunset painted the sky in shades of orange and pink."
\ + --provider qwen --voice Ryan --language en --gpu p100 -Process multiple texts concurrently with automatic provider-specific resource management: +# Chinese with native speaker +poetry run voicegenhub synthesize "你好,这是一个测试。" --provider qwen --voice Serena --language zh --gpu p100 -```bash -# Process multiple texts (auto-numbered output files) -poetry run voicegenhub synthesize "First text" "Second text" "Third text" --provider edge --output batch_output +# Use T4 GPUs instead of P100 +poetry run voicegenhub synthesize "Hello!" --provider qwen --gpu t4 -# Control concurrency (auto-configured per provider if not specified) -poetry run voicegenhub synthesize "Text 1" "Text 2" --provider bark --max-concurrent 2 --output output +# Custom model, output directory, polling options +poetry run voicegenhub synthesize "Hello!" \ + --provider qwen \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --output-dir my_output \ + --gpu p100 \ + --timeout 90 \ + --poll-interval 30 ``` -**Provider Concurrency Limits (automatic):** -- **Fast providers** (Edge, Kokoro, ElevenLabs): Use all CPU cores -- **Heavy providers** (Bark: 2 concurrent, Chatterbox: 1 concurrent) - -**Benefits:** -- Model instances are shared across concurrent jobs (no reloading) -- Automatic resource management prevents system overload -- Progress tracking for each job -- Failed jobs don't stop the batch - -## Voice Cloning with Kokoro and Chatterbox - -VoiceGenHub supports zero-shot voice cloning by combining Kokoro's lightweight voices with Chatterbox's advanced cloning capabilities. This allows you to create custom voices that sound like Kokoro but with Chatterbox's superior quality and emotion control. +### Voice Cloning (Qwen3-TTS, Kaggle GPU) -### Step-by-Step Guide +Clone your own voice onto arbitrary text using the Qwen3-TTS Base model and a reference WAV: -1. 
**Generate a Kokoro voice sample** (modify as desired or keep undistorted): - ```bash - # Undistorted voice - poetry run voicegenhub synthesize "Sample text for cloning." --provider kokoro --voice kokoro-am_michael --output reference.wav --format wav - - # Or with effects (e.g., horror/distortion) - poetry run voicegenhub synthesize "Sample text for cloning." --provider kokoro --voice kokoro-am_adam --output reference.wav --format wav --pitch-shift -2 --distortion 0.02 --lowpass 2000 --normalize - ``` - -2. **Clone the voice with Chatterbox**: - ```bash - poetry run voicegenhub synthesize "Your longer text here." --provider chatterbox --voice chatterbox-default --output cloned_voice.wav --audio-prompt reference.wav - ``` - -3. **Optional: Adjust emotion and style**: - ```bash - poetry run voicegenhub synthesize "Your text." --provider chatterbox --voice chatterbox-default --output cloned_voice.wav --audio-prompt reference.wav --exaggeration 0.8 --cfg-weight 0.7 - ``` +```bash +poetry run voicegenhub synthesize "this is my speech using my own voice" \ + --provider qwen \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --audio-prompt "path/to/reference.wav" \ + --ref-text "Exact transcript of the reference audio" \ + --gpu p100 +``` **Tips:** -- Use short, clear reference audio (5-10 seconds) for best cloning results -- Combine multiple Kokoro samples with FFmpeg for richer voice profiles -- Experiment with Kokoro effects to create unique voice characteristics before cloning -- Chatterbox supports multilingual cloning from any language reference audio - -## Concurrency and Memory Management - -**Async Concurrency (Recommended):** -- Use the `synthesize` command with multiple texts for safe concurrent processing within a single process -- Models are loaded once and shared across concurrent jobs -- Prevents out-of-memory (OOM) errors from duplicate model loading -- Automatic provider-specific limits ensure stability - -**Multiprocessing Risks:** -- Running multiple CLI processes simultaneously (e.g., via scripts or parallel jobs)
loads separate model instances -- Heavy models like Chatterbox (3.7GB) and Bark (4GB) can cause OOM when duplicated across processes -- **Recommendation:** Use async batch processing instead of multiprocessing for heavy providers -- For light providers (Edge, Kokoro), multiprocessing is safer due to minimal memory footprint - -## Performance Comparison: All TTS Providers - -Here's how all providers compare in terms of speed and quality: - -| Provider | Quality (MOS) | Startup Time | Sequential (per req) | Async (3x parallel) | Model Size | Commercial Licensed | -|----------|---------------|--------------|---------------------|-------------------|------------|----------------| -| **Edge TTS** | 3.8/5 | 4.9s | 3.2s | 2.5s | 0MB (cloud) | ✅ Free | -| **Kokoro** | 3.5/5 | 94s | 14.2s | 2.5s | 625MB | ✅ Apache 2.0 | -| **Bark** | 4.2/5 | 180s | 25-40s | 8-12s | 4GB | ✅ MIT | -| **Chatterbox** | 4.3/5 | 120s | 15-30s | 5-15s | 3.7GB | ✅ MIT | -| **ElevenLabs** | 4.5/5* | 2s | 3-5s | 2-3s | 0MB (cloud) | ⚠️ Paid API | - -*ElevenLabs quality estimate based on provider reputation; not yet tested with API key. +- Use a reference WAV of at least 20 seconds, with clear speech and a matching transcript for best results. +- The `--ref-text` should be the exact transcript of the reference audio (no ellipsis or truncation). +- For batch synthesis, pass multiple texts in quotes. 
-**Key Findings:** -- **Chatterbox**: Excellent quality with emotion control and multilingual support; MIT licensed, works on CPU -- **Bark**: Highest naturalness for premium narration; MIT licensed (full commercial freedom) -- **Kokoro**: Best balance of quality vs speed for offline use; Apache 2.0 licensed -- **Edge TTS**: Best for real-time, low-latency applications; cloud-based (Microsoft) -- **ElevenLabs**: Highest quality but requires paid API and credit card -- **For commercial purposes:** Use Bark (MIT), Chatterbox (MIT), or Kokoro (Apache 2.0) - -## Chatterbox Concurrency Analysis - -**Memory Safety**: Chatterbox uses a **shared model instance** (3.6GB) across all threads - **no duplication**. Safe to use 2-8 concurrent threads without OOM risk. - -**Performance**: ~2.8x speedup at 4 threads on CPU. Optimal thread count: **2-4 threads**. - -**[View Interactive Performance Analysis](assets/concurrency_plot.html)** - Shows speedup curves, memory usage, and timing breakdowns. - -## Commercial Licensing - -### ✅ Commercially Safe Models: -- **Bark** (MIT License) - Unrestricted commercial use, no attribution required ⭐ -- **Chatterbox** (MIT License) - Unrestricted commercial use, no attribution required -- **Qwen 3 TTS** (Apache 2.0) - Commercial use allowed, attribution required -- **Kokoro** (Apache 2.0) - Commercial use allowed, attribution required -- **Edge TTS** (Microsoft) - Commercial use allowed -- **ElevenLabs** (Paid API) - Commercial use with valid subscription - -## Provider Licenses +See [docs/cloning_and_design.md](docs/cloning_and_design.md) for advanced usage and troubleshooting. + -For transparency and compliance, here are direct links to the official license terms for each supported TTS provider: +Batch output lands in a timestamped folder (e.g.
`20260227_123130_p100/`) with: +- `audio_001.wav`, `audio_002.wav`, … (one per input sentence) +- `manifest.json` — maps each filename to its source text and duration -- **Edge TTS (Microsoft)**: [Microsoft Terms of Use](https://www.microsoft.com/en-us/legal/terms-of-use) -- **Kokoro TTS**: [Apache License 2.0](https://github.com/hexgrad/kokoro/blob/main/LICENSE) -- **ElevenLabs TTS**: [ElevenLabs Terms of Service](https://elevenlabs.io/terms) -- **Bark TTS**: [MIT License](https://github.com/suno-ai/bark/blob/main/LICENSE) -- **Chatterbox TTS**: [MIT License](https://github.com/rsxdalv/chatterbox/blob/main/LICENSE) -- **Qwen 3 TTS**: [Apache License 2.0](https://github.com/QwenLM/Qwen3-TTS/blob/main/LICENSE) +--- -## Optional Dependencies +## Batch Processing (local providers) -Install optional TTS providers: +Pass multiple texts to any provider — processed concurrently with shared model instances: ```bash -# Install Kokoro TTS (self-hosted lightweight TTS) -pip install voicegenhub[kokoro] - -# Install Bark (self-hosted high-naturalness TTS) -pip install voicegenhub[bark] - -# Install Chatterbox TTS (MIT licensed, multilingual with emotion control) -pip install chatterbox-tts - -# Install Qwen 3 TTS (Apache 2.0 licensed, state-of-the-art multilingual TTS) -pip install voicegenhub[qwen] +poetry run voicegenhub synthesize "First." "Second." "Third." --provider edge --output batch_output ``` -### Kokoro TTS Installation -Kokoro TTS requires Python 3.11 or higher. - -#### Windows & Python 3.13+ Build Limitation +--- -**Important:** On Windows with Python 3.13+, Kokoro TTS (via curated-tokenizers) may require compiling native code if pre-built wheels are not available. This requires Microsoft Visual C++ Build Tools. - -If you see errors about missing C++ compilers or build failures when installing Kokoro, follow these steps: - -1. Download and install [Microsoft Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/). -2. 
During installation, select "Desktop development with C++" workload. -3. After installation, restart your terminal and retry installation: - ```bash - poetry install --with kokoro - # or - pip install voicegenhub[kokoro] - ``` - -If you still see build errors, check for available wheels for `curated-tokenizers` on [PyPI](https://pypi.org/project/curated-tokenizers/#files). If no wheel is available for your Python version, you must build from source (requires Visual C++). - -**Recommendation:** For easiest installation, use Python 3.11 or 3.12 on Windows until wheels for Python 3.13+ are published. - -#### Installation +## Voices ```bash -# Using Poetry (recommended): -poetry add voicegenhub[kokoro] -# or: -poetry install --with kokoro -``` +poetry run voicegenhub voices --language en --provider edge +poetry run voicegenhub voices --language en --provider kokoro +poetry run voicegenhub voices --language en --provider bark +poetry run voicegenhub voices --language en --provider chatterbox +poetry run voicegenhub voices --language en --provider elevenlabs +poetry run voicegenhub voices --provider qwen +``` + +### Qwen speakers + +| Speaker | Gender | Native language | Notes | +|---|---|---|---| +| `Ryan` | Male | English | Dynamic, rhythmic — works for all languages | +| `Aiden` | Male | English | Sunny American voice | +| `Vivian` | Female | Chinese | Bright, slightly edgy | +| `Serena` | Female | Chinese | Warm, gentle | +| `Uncle_Fu` | Male | Chinese | Low, mellow timbre | +| `Dylan` | Male | Chinese (Beijing) | Natural, youthful | +| `Eric` | Male | Chinese (Sichuan) | Slightly husky | +| `Ono_Anna` | Female | Japanese | Playful, nimble | +| `Sohee` | Female | Korean | Warm, emotional | + +--- + +## Key `synthesize` options + +| Flag | Description | +|---|---| +| `TEXT ...` | One or more texts to synthesize | +| `--provider` | TTS provider (`edge`, `kokoro`, `bark`, `chatterbox`, `qwen`, `elevenlabs`) | +| `--voice`, `-v` | Voice / speaker name | +| `--language`, 
`-l` | Language code (`en`, `zh`, `fr`, ...) | +| `--output`, `-o` | Output file or directory | +| `--gpu [p100\|t4]` | Run on free Kaggle GPU (Qwen) | +| `--model`, `-m` | HuggingFace model ID override | +| `--output-dir` | Local directory for Kaggle batch output | +| `--timeout` | Kaggle polling timeout in minutes (default 60) | +| `--poll-interval` | Kaggle polling interval in seconds (default 60) | +| `--seed` | Random seed for reproducible generation (default 42) | +| `--temperature` | Sampling temperature (lower = more neutral, higher = more expressive; default 0.7) | +| `--exaggeration` | Chatterbox emotion intensity 0–1 | +| `--audio-prompt` | Reference audio for voice cloning (Chatterbox, Qwen) | + +--- + +## Docs + +- [Installation & optional dependencies](docs/installation.md) +- [Provider details & voice lists](docs/providers.md) +- [Kaggle GPU setup](docs/kaggle_gpu.md) +- [Voice cloning & design](docs/cloning_and_design.md) +- [Benchmarks & performance](docs/benchmarks_and_performance.md) +- [Licensing](docs/licensing.md) diff --git a/docs/benchmarks_and_performance.md b/docs/benchmarks_and_performance.md new file mode 100644 index 0000000..f99a724 --- /dev/null +++ b/docs/benchmarks_and_performance.md @@ -0,0 +1,27 @@ +# Performance and Benchmarks + +VoiceGenHub is designed for both local CPU-only systems and GPU-accelerated environments.
+ +## Performance Comparison (Single Job) + +| Provider | Quality (MOS) | Startup Time | Sequential (per req) | Async (3x parallel) | Model Size | Commercial | +|----------|---------------|--------------|---------------------|-------------------|------------|------------| +| **Edge TTS** | 3.8/5 | 4.9s | 3.2s | 2.5s | 0MB (cloud) | ✅ Free | +| **Kokoro** | 3.5/5 | 94s | 14.2s | 2.5s | 625MB | ✅ Apache 2.0 | +| **Bark** | 4.2/5 | 180s | 25-40s | 8-12s | 4GB | ✅ MIT | +| **Chatterbox** | 4.3/5 | 120s | 15-30s | 5-15s | 3.7GB | ✅ MIT | +| **ElevenLabs** | 4.5/5* | 2s | 3-5s | 2-3s | 0MB (cloud) | ⚠️ Paid API | + +*ElevenLabs quality estimate based on reputation; not yet tested.* + +## Concurrency Analysis (Chatterbox) + +- **Memory Safety**: Chatterbox uses a **shared model instance** (3.6GB) across all threads — **no duplication**. +- **Performance**: ~2.8x speedup at 4 threads on CPU. Optimal thread count: **2-4 threads**. +- **Async Concurrency**: Safe to use 2-8 concurrent threads without OOM risk. + +## [View Concurrency Plot](../assets/concurrency_plot.html) +Interactive performance analysis showing speedup curves, memory usage, and timing breakdowns. + +--- +*For more details on Kaggle GPU benchmarks, see the [Kaggle GPU documentation](kaggle_gpu.md).* diff --git a/docs/cloning_and_design.md b/docs/cloning_and_design.md new file mode 100644 index 0000000..cf1e02e --- /dev/null +++ b/docs/cloning_and_design.md @@ -0,0 +1,51 @@ +# Voice Cloning and Design + +VoiceGenHub supports both zero-shot voice cloning (from audio samples) and voice design (from textual descriptions). + +## 1. Voice Cloning with [Chatterbox](https://github.com/rsxdalv/chatterbox) + +### Steps + +1. **Generate a Reference Audio** (or use an existing sample): + ```bash + voicegenhub synthesize "Sample text for cloning." \ + --provider kokoro \ + --voice kokoro-am_michael \ + --output reference.wav + ``` + +2.
**Clone the Voice**: + ```bash + voicegenhub synthesize "Your text to be synthesized in the cloned voice." \ + --provider chatterbox \ + --audio-prompt reference.wav \ + --output cloned_voice.wav + ``` + +3. **Adjust Emotion and Style**: + ```bash + voicegenhub synthesize "Your text." \ + --provider chatterbox \ + --audio-prompt reference.wav \ + --exaggeration 0.8 \ + --cfg-weight 0.7 + ``` + +### Tips for Better Quality +- Use clear, noise-free reference audio (5-10 seconds recommended). +- Chatterbox supports **multilingual cloning** (clone any language, synthesize in any other language). + +## 2. Voice Design with [Qwen 3 TTS](https://github.com/QwenLM/Qwen3-TTS) + +*Requires `Qwen3-TTS-VoiceDesign` model for full control, available via Python API or remote GPU.* + +### Qwen 3 TTS Voice Design Features + +- **Natural Language Instruction**: Design custom voices using descriptions. +- **Example Voice Design**: + - `"Female, 25 years old, cheerful and energetic, slightly high-pitched with playful intonation"` + - `"Male, 17 years old, gaining confidence, deeper breath support, vowels tighten when nervous"` + - `"Elderly male, 70 years old, wise and gentle, slightly raspy with warm timbre"` + +--- +*For more details on Qwen 3 TTS design modes, see the [Qwen 3 TTS documentation](https://github.com/QwenLM/Qwen3-TTS).* diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..a7941d1 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,66 @@ +# Installation and Requirements + +Detailed installation guide for various TTS providers and optional features. 
+ +## Basic Installation + +```bash +pip install voicegenhub +``` + +## Optional Provider Dependencies + +To use certain providers, you need to install their respective dependencies: + +```bash +# Kokoro TTS (Lightweight, self-hosted) +pip install voicegenhub[kokoro] + +# Bark TTS (High Quality, MIT) +pip install voicegenhub[bark] + +# Chatterbox TTS (High Quality, MIT) +pip install chatterbox-tts + +# Qwen 3 TTS (State-of-the-Art, Apache 2.0) +pip install voicegenhub[qwen] + +# ElevenLabs TTS (Commercial) +pip install elevenlabs +``` + +--- + +## 2. Dependencies + +### Voice Cloning Requirements (Chatterbox) + +For voice cloning features with Chatterbox TTS: + +```bash +pip install voicegenhub[voice-cloning] +``` + +**System Requirements:** +- **FFmpeg**: Required when `torchcodec` is installed for voice cloning. +- **PyTorch**: Required for local model execution. + +**Windows Installations**: Download the "full-shared" FFmpeg build from [ffmpeg.org](https://ffmpeg.org/download.html#build-windows) and add the `bin` directory to your system PATH. + +--- + +## Technical Note: CUDA and CPU Execution + +- VoiceGenHub automatically detects if a GPU is available. +- For **Chatterbox** and **Bark**, if no GPU is found, the library will fall back to **CPU execution**. +- For **Qwen 3 TTS**, high-quality models (1.7B) are recommended for **GPU acceleration** (remote or local). + +--- + +## Windows & Python 3.13+ (Kokoro) + +On Windows with Python 3.13+, **Kokoro TTS** may require Microsoft Visual C++ Build Tools for compilation if pre-built wheels are not available. + +1. Download [Microsoft Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/). +2. Select "Desktop development with C++" workload. +3. Restart terminal and retry installation. 
diff --git a/docs/kaggle_gpu.md b/docs/kaggle_gpu.md new file mode 100644 index 0000000..de4bcd9 --- /dev/null +++ b/docs/kaggle_gpu.md @@ -0,0 +1,51 @@ +# Kaggle Remote GPU Generation + +Generate high-quality Qwen3-TTS audio using remote Kaggle GPUs (P100 or T4x2). This is useful for high-quality 1.7B models when you don't have a local GPU. + +## Prerequisites + +1. **Kaggle API Credentials**: + - Go to [Kaggle Settings](https://www.kaggle.com/settings) → API → Create New Token. + - Save the `kaggle.json` to `~/.kaggle/kaggle.json` (on Windows: `%USERPROFILE%\.kaggle\kaggle.json`). +2. **Kaggle CLI**: + ```bash + pip install kaggle + ``` +3. **Kaggle Internet Access**: + - Ensure your Kaggle account has phone verification completed (allows internet access in kernels). + +## Usage + +Pass the `--gpu` option with an accelerator name (`p100` or `t4`) to the `synthesize` command to trigger remote generation. The option always requires a value; there is no default accelerator. + +### P100 GPU + +```bash +voicegenhub synthesize "Hello from the remote P100!" --gpu p100 +``` + +### T4 x 2 GPU + +```bash +voicegenhub synthesize "Hello from the remote T4!" --gpu t4 +``` + +### Advanced Usage + +```bash +voicegenhub synthesize "Chinese test." \ + --gpu p100 \ + --voice Serena \ + --language zh \ + --output ./remote_output/serena.wav +``` + +## How It Works + +1. **Automation**: VoiceGenHub generates a Jupyter notebook cell-by-cell. +2. **Deployment**: It pushes the notebook to Kaggle using the specified accelerator (`nvidia-p100-1` or `nvidia-t4-2`). +3. **Execution**: On Kaggle, the notebook installs necessary dependencies (`transformers`, `qwen-tts`), loads the model onto the GPU, and generates the audio. +4. **Syncing**: The CLI polls for completion and automatically downloads the generated `.wav` file into a local timestamped directory (or your specified output path). 
+ +--- +*Note: Remote generation takes approximately 2-4 minutes due to environment setup on Kaggle's side.* diff --git a/docs/licensing.md b/docs/licensing.md new file mode 100644 index 0000000..ef0e9db --- /dev/null +++ b/docs/licensing.md @@ -0,0 +1,19 @@ +# Licensing and Commercial Usage + +VoiceGenHub is compatible with multiple free and commercial TTS licenses. + +## Commercially Safe Models (summary) +- **Bark** (MIT License) - Commercial use allowed; the copyright and license notice must be retained in copies. +- **Chatterbox** (MIT License) - Commercial use allowed; the copyright and license notice must be retained in copies. +- **Qwen 3 TTS** (Apache 2.0) - Commercial use allowed, attribution required. +- **Kokoro** (Apache 2.0) - Commercial use allowed, attribution required. +- **Edge TTS** (Microsoft) - Commercial use allowed. +- **ElevenLabs** (Paid API) - Commercial use with valid subscription. + +### Provider Licenses (links) +- **Edge TTS (Microsoft)**: [Microsoft Terms of Use](https://www.microsoft.com/en-us/legal/terms-of-use) +- **Kokoro TTS**: [Apache License 2.0](https://github.com/hexgrad/kokoro/blob/main/LICENSE) +- **ElevenLabs TTS**: [ElevenLabs Terms of Service](https://elevenlabs.io/terms) +- **Bark TTS**: [MIT License](https://github.com/suno-ai/bark/blob/main/LICENSE) +- **Chatterbox TTS**: [MIT License](https://github.com/rsxdalv/chatterbox/blob/main/LICENSE) +- **Qwen 3 TTS**: [Apache License 2.0](https://github.com/QwenLM/Qwen3-TTS/blob/main/LICENSE) diff --git a/docs/providers.md b/docs/providers.md new file mode 100644 index 0000000..9d15ca9 --- /dev/null +++ b/docs/providers.md @@ -0,0 +1,51 @@ +# TTS Providers Detail + +VoiceGenHub supports multiple free and commercial TTS providers. + +## [Chatterbox TTS](https://github.com/rsxdalv/chatterbox) (MIT) +Multilingual TTS with emotion control and voice cloning. + +### Features +- **Model selection via voice**: Choose between standard, turbo, or multilingual models. +- Emotion/intensity control with `exaggeration` parameter (0.0-1.0). 
+- Zero-shot voice cloning from audio samples. +- Built-in Perth watermarking for responsible AI. + +### Supported Languages +ar, da, de, el, en, es, fi, fr, he, hi, it, ja, ko, ms, nl, no, pl, pt, ru, sv, sw, tr, zh + +--- + +## [Qwen 3 TTS](https://github.com/QwenLM/Qwen3-TTS) (Apache 2.0) +State-of-the-art multilingual TTS with voice design and cloning. + +### Features +- **Three generation modes**: CustomVoice, VoiceDesign, VoiceClone. +- **10 languages**: Chinese, English, French, German, Italian, Japanese, Korean, Portuguese, Russian, Spanish. +- **Native speakers**: Automatic selection of native speakers per language. +- **Ultra-low latency**: Streaming generation supported. + +--- + +## [Bark TTS](https://github.com/suno-ai/bark) (MIT) +Self-hosted high-naturalness TTS with prosody control. + +### Features +- Prosody markers: `[laughs]`, `[sighs]`, `[pause]`, `[whisper]`. +- 100+ speaker presets. +- Sound effects generation. + +--- + +## [Kokoro TTS](https://github.com/hexgrad/kokoro) (Apache 2.0) +Self-hosted, extremely lightweight and fast. + +--- + +## [Microsoft Edge TTS](https://github.com/rany2/edge-tts) (Free Cloud) +Fast, high-quality cloud-based voices. + +--- + +## [ElevenLabs TTS](https://elevenlabs.io) (Commercial) +Premium high-quality voices (requires API key). 
diff --git a/pyproject.toml b/pyproject.toml index 12e019d..5feb3b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "voicegenhub" -version = "1.1.5" +version = "2.0.0" description = "Simple Text-to-Speech library supporting multiple providers" authors = ["leweex95 "] readme = "README.md" diff --git a/src/voicegenhub/cli.py b/src/voicegenhub/cli.py index 323f89e..721ff22 100644 --- a/src/voicegenhub/cli.py +++ b/src/voicegenhub/cli.py @@ -8,6 +8,7 @@ import sys import tempfile import threading +from datetime import datetime from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Optional @@ -45,6 +46,7 @@ def _process_single( instruct: Optional[str] = None, ref_audio: Optional[str] = None, ref_text: Optional[str] = None, + seed: Optional[int] = None, ): """Process a single text with effects support.""" try: @@ -52,6 +54,22 @@ def _process_single( tts = VoiceGenHub(provider=provider) asyncio.run(tts.initialize()) + # Build extra kwargs + extra_kwargs = dict( + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) + if audio_prompt_path: + extra_kwargs["audio_prompt_path"] = audio_prompt_path + if instruct: + extra_kwargs["instruct"] = instruct + if ref_audio: + extra_kwargs["ref_audio"] = ref_audio + if ref_text: + extra_kwargs["ref_text"] = ref_text + if seed is not None: + extra_kwargs["seed"] = seed + # Generate audio response = asyncio.run(tts.generate( text=text, @@ -60,12 +78,7 @@ def _process_single( audio_format=AudioFormat(audio_format), speed=speed, pitch=pitch, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - audio_prompt_path=audio_prompt_path, - instruct=instruct, - ref_audio=ref_audio, - ref_text=ref_text, + **extra_kwargs, )) output_path = Path(output).resolve() if output else Path(".") / f"voicegenhub_output.{audio_format}" @@ -159,6 +172,7 @@ def _process_batch( instruct: Optional[str] = None, ref_audio: Optional[str] = None, ref_text: Optional[str] = 
None, + seed: Optional[int] = None, ): """Process multiple texts concurrently with provider-specific limits. @@ -215,6 +229,20 @@ def process_item(index: int, text: str): try: # Run async generation in thread async def generate(): + gen_kwargs = dict( + exaggeration=exaggeration, + cfg_weight=cfg_weight, + ) + if audio_prompt_path: + gen_kwargs["audio_prompt_path"] = audio_prompt_path + if instruct: + gen_kwargs["instruct"] = instruct + if ref_audio: + gen_kwargs["ref_audio"] = ref_audio + if ref_text: + gen_kwargs["ref_text"] = ref_text + if seed is not None: + gen_kwargs["seed"] = seed return await shared_tts.generate( text=text, voice=voice, @@ -222,12 +250,7 @@ async def generate(): audio_format=AudioFormat(audio_format), speed=speed, pitch=pitch, - exaggeration=exaggeration, - cfg_weight=cfg_weight, - audio_prompt_path=audio_prompt_path, - instruct=instruct, - ref_audio=ref_audio, - ref_text=ref_text, + **gen_kwargs, ) response = asyncio.run(generate()) @@ -255,6 +278,7 @@ async def generate(): instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) else: # Save output directly @@ -287,7 +311,6 @@ async def generate(): @click.group() def cli(): """VoiceGenHub - Simple Text-to-Speech CLI.""" - pass @cli.command() @@ -313,6 +336,16 @@ def cli(): "--pitch", type=float, default=1.0, help="Speech pitch (0.5-2.0, default 1.0)" ) @click.option("--provider", "-p", help="TTS provider") +@click.option( + "--gpu", + type=click.Choice(["p100", "t4"]), + help="Use remote Kaggle GPU for generation (currently Qwen3-TTS only)", +) +@click.option( + "--cpu", + is_flag=True, + help="Use local CPU for generation (default)", +) @click.option( "--lowpass", type=int, @@ -385,13 +418,117 @@ def cli(): type=str, help="Qwen 3 TTS: Reference text for voice cloning", ) +@click.option( + "--model", + "-m", + type=str, + default=None, + help="Qwen 3 TTS: HuggingFace model ID (e.g. 
Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)", +) +@click.option( + "--output-dir", + type=str, + default=None, + help="Kaggle GPU: Local directory for the downloaded audio (default: YYYYMMDD_HHMMSS_)", +) +@click.option( + "--output-filename", + type=str, + default="qwen3_tts.wav", + show_default=True, + help="Kaggle GPU: Filename for the generated audio file", +) +@click.option( + "--timeout", + type=int, + default=60, + show_default=True, + help="Kaggle GPU: Timeout in minutes to wait for the kernel", +) +@click.option( + "--poll-interval", + type=int, + default=60, + show_default=True, + help="Kaggle GPU: Status polling interval in seconds", +) +@click.option( + "--seed", + type=int, + default=42, + show_default=True, + help="Random seed for reproducible generation (local CPU and Kaggle GPU)", +) +@click.option( + "--temperature", + type=float, + default=0.7, + show_default=True, + help="Kaggle GPU: Sampling temperature (lower = more stable/neutral tone, higher = more expressive)", +) def synthesize( texts, voice, language, output, format, rate, pitch, provider, - lowpass, normalize, distortion, noise, reverb, pitch_shift, + gpu, cpu, lowpass, normalize, distortion, noise, reverb, pitch_shift, exaggeration, cfg_weight, audio_prompt, turbo, multilingual, - instruct, ref_audio, ref_text + instruct, ref_audio, ref_text, + model, output_dir, output_filename, timeout, poll_interval, seed, temperature, ): - """Generate speech from text(s).""" + """Generate speech from text(s). 
Use --gpu [p100|t4] for remote Kaggle GPU acceleration.""" + # Redirect to Kaggle pipeline if --gpu is specified + if gpu: + from .kaggle.pipeline import KaggleQwenPipeline + pipeline = KaggleQwenPipeline( + model_id=model or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + timeout_minutes=timeout, + poll_interval_seconds=poll_interval, + ) + + # Determine output directory + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + suffix = f"_{gpu}" + if output_dir: + resolved_output_dir = output_dir + elif output: + output_path_obj = Path(output) + resolved_output_dir = str(output_path_obj if not output_path_obj.suffix else output_path_obj.parent / f"{timestamp}{suffix}") + else: + resolved_output_dir = f"{timestamp}{suffix}" + + try: + result_paths = pipeline.run( + texts=list(texts), + voice=voice or "Ryan", + language=language or "en", + output_dir=resolved_output_dir, + gpu_type=gpu, + seed=seed, + temperature=temperature, + instruct=instruct or "", + ref_audio_path=audio_prompt or "", + ref_text=ref_text or "", + ) + click.echo(f"SUCCESS: {len(result_paths)} audio file(s) in: {Path(resolved_output_dir).absolute()}") + for p in result_paths: + click.echo(f" {p.name}") + manifest = Path(resolved_output_dir) / "manifest.json" + if manifest.exists(): + click.echo(" manifest.json (prompt→file mapping)") + return + except Exception as e: + click.echo(f"Error during remote generation: {e}", err=True) + sys.exit(1) + + # For local CPU runs, ensure directory structure matches requested format + if not output: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = f"{timestamp}_cpu" + os.makedirs(output_dir, exist_ok=True) + # For single text, we still want to respect the output_dir + if len(texts) == 1: + output = os.path.join(output_dir, "output.wav") + else: + output = os.path.join(output_dir, "batch") + # Validate provider immediately supported_providers = [ "edge", "kokoro", "elevenlabs", "bark", "chatterbox", "qwen" @@ -452,6 +589,7 @@ def synthesize( 
instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) else: # Single text processing (original behavior) @@ -476,6 +614,7 @@ def synthesize( instruct=instruct, ref_audio=ref_audio, ref_text=ref_text, + seed=seed, ) @@ -492,7 +631,7 @@ def synthesize( def voices(language: Optional[str], format: str, provider: str): """List available voices.""" # Validate provider immediately - supported_providers = ["edge", "kokoro", "elevenlabs", "bark", "chatterbox"] + supported_providers = ["edge", "kokoro", "elevenlabs", "bark", "chatterbox", "qwen"] if provider and provider not in supported_providers: click.echo( f"Error: Unsupported provider '{provider}'. Supported providers: {', '.join(supported_providers)}", diff --git a/src/voicegenhub/kaggle/__init__.py b/src/voicegenhub/kaggle/__init__.py new file mode 100644 index 0000000..7554829 --- /dev/null +++ b/src/voicegenhub/kaggle/__init__.py @@ -0,0 +1,5 @@ +"""Kaggle GPU pipeline for remote TTS generation.""" + +from .pipeline import KaggleQwenPipeline + +__all__ = ["KaggleQwenPipeline"] diff --git a/src/voicegenhub/kaggle/pipeline.py b/src/voicegenhub/kaggle/pipeline.py new file mode 100644 index 0000000..4ddab6c --- /dev/null +++ b/src/voicegenhub/kaggle/pipeline.py @@ -0,0 +1,764 @@ +""" +Kaggle GPU Pipeline for Qwen3-TTS Remote Generation. + +Pushes a notebook to Kaggle, runs it on a free P100 GPU, +polls for completion, and downloads the generated audio automatically. 
def _detect_kaggle_username() -> str:
    """Return the Kaggle username used to build kernel and dataset IDs.

    Lookup order:
      1. The ``KAGGLE_USERNAME`` environment variable.
      2. The ``username`` field of ``~/.kaggle/kaggle.json``.

    Returns:
        A non-empty username string.

    Raises:
        RuntimeError: If neither source yields a non-empty username. This
            covers a credentials file that is missing, unreadable, not valid
            JSON, not a JSON object, or that has no usable ``username``.
    """
    # 1. Environment variable takes precedence over the credentials file.
    env_username = os.environ.get("KAGGLE_USERNAME")
    if env_username:
        return env_username

    # 2. ~/.kaggle/kaggle.json
    kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
    try:
        with open(kaggle_json, encoding="utf-8") as f:
            creds = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or malformed credentials (JSONDecodeError and
        # UnicodeDecodeError are ValueError subclasses): report it through
        # the RuntimeError below instead of failing with a low-level error.
        creds = None

    if isinstance(creds, dict):
        username = creds.get("username")
        # An absent or blank username must NOT be returned: callers build
        # "<username>/<slug>" IDs from it, and an empty string would yield a
        # malformed ID such as "/voicegenhub-qwen3-tts".
        if isinstance(username, str) and username.strip():
            return username.strip()

    raise RuntimeError(
        "Kaggle username not found. Set KAGGLE_USERNAME env var "
        "or ensure ~/.kaggle/kaggle.json exists with 'username' field."
    )
When + *instruct* is also provided it is forwarded to the clone call for + style/emotion control. When only *instruct* is set, the named VOICE + speaker is used via ``generate_custom_voice(instruct=…)``. + """ + + # Language mapping (CLI code → Qwen language string) + language_map = { + "en": "English", + "zh": "Chinese", + "fr": "French", + "de": "German", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "pt": "Portuguese", + "ru": "Russian", + "es": "Spanish", + } + qwen_language = language_map.get(language.lower(), "English") + + install_code = textwrap.dedent("""\ + import subprocess, sys + + def pip_install(*packages): + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages]) + + pip_install("transformers>=4.40.0", "accelerate>=0.27.0", "tokenizers") + pip_install("qwen-tts") + try: + pip_install("flash-attn", "--no-cache-dir") + except Exception as e: + print(f"flash-attn install skipped (non-fatal): {e}") + pip_install("soundfile") + """) + + # Embed the texts list and metadata directly into the notebook cell + gen_code = textwrap.dedent(f"""\ + import json + import torch + import soundfile as sf + from qwen_tts import Qwen3TTSModel + + MODEL_ID = {json.dumps(model_id)} + VOICE = {json.dumps(voice)} + LANGUAGE = {json.dumps(qwen_language)} + TEXTS = {json.dumps(texts)} + OUTPUT_DIR = "/kaggle/working" + SEED = {seed} + TEMPERATURE = {temperature} + INSTRUCT = {json.dumps(instruct)} + REF_AUDIO_PATH = {json.dumps(ref_audio_kernel_path)} + REF_TEXT = {json.dumps(ref_text)} + + # Pin global seed for reproducibility across runs + torch.manual_seed(SEED) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(SEED) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + print(f"CUDA available: {{torch.cuda.is_available()}}") + if torch.cuda.is_available(): + print(f"GPU: {{torch.cuda.get_device_name(0)}}") + print(f"Seed: {{SEED}} Temperature: {{TEMPERATURE}}") + if INSTRUCT: + 
print(f"Instruct: {{INSTRUCT}}") + if REF_AUDIO_PATH: + print(f"Voice clone mode: reference audio at {{REF_AUDIO_PATH}}") + + # Reference audio path set directly from Kaggle dataset input + _ref_audio_path = REF_AUDIO_PATH if REF_AUDIO_PATH else None + + print(f"Loading model: {{MODEL_ID}}") + model = Qwen3TTSModel.from_pretrained( + MODEL_ID, + device_map="cuda:0" if torch.cuda.is_available() else "cpu", + dtype=torch.float16, + ) + + # Guard: verify the loaded model supports voice cloning before entering the loop + if _ref_audio_path: + _mt = getattr(model.model, 'tts_model_type', 'unknown') + if _mt != 'base': + raise ValueError( + "Voice cloning requires tts_model_type='base' but got: " + str(_mt) + + ". MODEL_ID=" + MODEL_ID + " does not support generate_voice_clone(). " + "Switch to a Qwen3-TTS base model (e.g. Qwen/Qwen3-TTS-12Hz-1.7B-Base)." + ) + + manifest = [] + for i, text in enumerate(TEXTS, start=1): + filename = f"audio_{{i:03d}}.wav" + out_path = f"{{OUTPUT_DIR}}/{{filename}}" + print(f"[{{i}}/{{len(TEXTS)}}] Generating: {{text[:80]}}") + # Re-seed before each text so every audio is independently reproducible + torch.manual_seed(SEED + i) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(SEED + i) + if _ref_audio_path: + # non_streaming_mode=True is required — the default (False) simulates + # streaming and does not terminate properly for single non-streaming calls, + # causing runaway generation (e.g. 10+ minutes of garbage audio). + clone_kwargs = dict( + text=text, + language=LANGUAGE, + ref_audio=_ref_audio_path, + temperature=TEMPERATURE, + top_p=0.9, + repetition_penalty=1.1, + non_streaming_mode=True, + ) + if REF_TEXT: + # Strip trailing ellipsis — ICL mode requires ref_text to match + # the actual audio content; truncated text causes runaway generation. + _clean_ref_text = REF_TEXT.rstrip(". 
").rstrip("…").rstrip(".") + clone_kwargs["ref_text"] = _clean_ref_text + if INSTRUCT: + clone_kwargs["instruct"] = INSTRUCT + try: + wavs, sr = model.generate_voice_clone(**clone_kwargs) + except TypeError as _e: + if INSTRUCT and "instruct" in str(_e): + print("Note: instruct not supported in clone mode, retrying without: " + str(_e)) + del clone_kwargs["instruct"] + wavs, sr = model.generate_voice_clone(**clone_kwargs) + else: + raise + else: + gen_kwargs = dict( + text=text, + language=LANGUAGE, + speaker=VOICE, + temperature=TEMPERATURE, + top_p=0.9, + repetition_penalty=1.1, + ) + if INSTRUCT: + gen_kwargs["instruct"] = INSTRUCT + wavs, sr = model.generate_custom_voice(**gen_kwargs) + sf.write(out_path, wavs[0], sr) + duration = len(wavs[0]) / sr + print(f" -> {{filename}} ({{duration:.2f}}s @ {{sr}} Hz)") + manifest.append({{"index": i, "file": filename, "text": text, "duration_sec": round(duration, 2)}}) + + manifest_path = f"{{OUTPUT_DIR}}/manifest.json" + with open(manifest_path, "w", encoding="utf-8") as f: + json.dump(manifest, f, ensure_ascii=False, indent=2) + + print(f"\\nDone — {{len(TEXTS)}} audio files + manifest.json written to {{OUTPUT_DIR}}") + for entry in manifest: + print(f" {{entry['file']}} {{entry['duration_sec']}}s {{entry['text'][:60]}}") + """) + + summary_lines = [f"- `audio_{i:03d}.wav`: {t[:80]}{'…' if len(t) > 80 else ''}\n" for i, t in enumerate(texts, 1)] + notebook = { + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3", + }, + "language_info": {"name": "python", "version": "3.10.0"}, + }, + "cells": [ + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "# VoiceGenHub — Qwen3-TTS GPU Batch Generation\n", + f"**Model:** `{model_id}` **Voice:** {voice} **Language:** {qwen_language}\n\n", + f"**{len(texts)} texts to synthesize:**\n", + ] + summary_lines, + }, + { + "cell_type": "code", + "execution_count": 
None, + "id": "install", + "metadata": {}, + "outputs": [], + "source": install_code.splitlines(keepends=True), + }, + { + "cell_type": "code", + "execution_count": None, + "id": "generate", + "metadata": {}, + "outputs": [], + "source": gen_code.splitlines(keepends=True), + }, + ], + } + return notebook + + +def _upload_ref_audio_dataset(audio_file: Path, username: str) -> str: + """Upload *audio_file* to a private Kaggle dataset and return the slug. + + Uses the stable slug ``voicegenhub-ref-audio``. If the dataset does not + yet exist the first call creates it; subsequent calls push a new version, + making this fully idempotent from the caller's perspective. + """ + dataset_slug = "voicegenhub-ref-audio" + with tempfile.TemporaryDirectory() as ds_dir: + ds_path = Path(ds_dir) + shutil.copy2(audio_file, ds_path / audio_file.name) + ds_meta = { + "title": "VoiceGenHub Reference Audio", + "id": f"{username}/{dataset_slug}", + "licenses": [{"name": "other"}], + } + (ds_path / "dataset-metadata.json").write_text(json.dumps(ds_meta)) + # Try updating an existing version first; fall back to creating from scratch. 
def _build_kernel_metadata(
    username: str,
    kernel_slug: str,
    notebook_filename: str,
    gpu_type: str = "p100",
    dataset_sources: Optional[list] = None,
) -> dict:
    """Build the content of Kaggle's ``kernel-metadata.json``.

    IMPORTANT: The 'title' must slugify to exactly the same value as the slug
    portion of the 'id' field. Kaggle derives the kernel slug from the title
    (spaces→hyphens, lowercase) and ignores the 'id' slug portion on creation.
    When they differ, every subsequent push hits a 409 Conflict because Kaggle
    already owns the title-derived slug.

    For the default slug ("voicegenhub-qwen3-tts") the human-friendly title
    "VoiceGenHub Qwen3 TTS" is used. For any other *kernel_slug* the title is
    derived from the slug itself (hyphens→spaces) so the invariant above still
    holds instead of silently breaking for custom slugs.

    Args:
        username: Kaggle account name; becomes the owner part of ``id``.
        kernel_slug: Slug part of the kernel ``id``.
        notebook_filename: Name of the notebook file, stored as ``code_file``.
        gpu_type: Accepted for call-site symmetry; it is not written into the
            metadata by this function (``enable_gpu`` is always ``True``).
        dataset_sources: Optional list of ``owner/dataset`` identifiers to
            attach to the kernel. ``None`` or empty means no datasets.

    Returns:
        A JSON-serialisable dict ready to be dumped to ``kernel-metadata.json``.
    """
    default_title = "VoiceGenHub Qwen3 TTS"
    if default_title.lower().replace(" ", "-") == kernel_slug:
        title = default_title
    else:
        # Inverse of the slugify rule described above: the derived title
        # lowercases/hyphenates back to exactly `kernel_slug`.
        title = kernel_slug.replace("-", " ")

    return {
        "id": f"{username}/{kernel_slug}",
        "title": title,
        "code_file": notebook_filename,
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "dataset_sources": dataset_sources or [],
        "competition_sources": [],
        "kernel_sources": [],
        "model_sources": [],
    }
def _extract_kernel_id_from_push(push_stdout: str, fallback: str) -> str:
    """Extract the actual ``owner/slug`` kernel ID from ``kaggle kernels push`` output.

    The push command prints something like:
        "Kernel version 1 successfully pushed. Please check progress at
         https://www.kaggle.com/code/leventecsibi/my-kernel-slug"

    The URL path is parsed to obtain the slug Kaggle actually used, which may
    differ from the one in the submitted metadata.

    Args:
        push_stdout: Captured standard output of the push command. May be
            empty or ``None`` (e.g. when nothing was captured).
        fallback: Kernel ID to return when no kernel URL can be found.

    Returns:
        The ``owner/slug`` taken from the URL, or *fallback* if the output
        contains no usable Kaggle code URL.
    """
    import re

    if not push_stdout:
        return fallback

    # Stop at '/', whitespace, '?' and '#' so a query string or fragment is
    # never mistaken for part of the slug.
    match = re.search(r"kaggle\.com/code/([^/\s?#]+/[^/\s?#]+)", push_stdout)
    if not match:
        return fallback

    # A URL that ends a sentence drags punctuation along ("…/my-kernel.");
    # strip it so the ID can be passed back to the Kaggle CLI unchanged.
    kernel_id = match.group(1).rstrip(".,;:!)'\"")
    owner, _, slug = kernel_id.partition("/")
    if not owner or not slug:
        return fallback
    return kernel_id
+ """ + + def __init__( + self, + model_id: str = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + dtype: str = "float16", + kernel_slug: str = _KERNEL_SLUG, + settings_path: Optional[Path] = None, + timeout_minutes: Optional[int] = None, + poll_interval_seconds: Optional[int] = None, + ): + self.model_id = model_id + self.dtype = dtype + self.kernel_slug = kernel_slug + self._settings = _load_settings() if settings_path is None else json.loads(Path(settings_path).read_text()) + # CLI-provided values take precedence over settings file + self._timeout_minutes = timeout_minutes if timeout_minutes is not None else self._settings.get("deployment_timeout_minutes", 60) + self._poll_interval = poll_interval_seconds if poll_interval_seconds is not None else self._settings.get("polling_interval_seconds", 60) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def run( + self, + texts, + voice: str = "Ryan", + language: str = "en", + output_dir: Optional[str] = None, + gpu_type: str = "p100", + seed: int = 42, + temperature: float = 0.7, + instruct: str = "", + ref_audio_path: str = "", + ref_text: str = "", + ) -> list: + """ + Run the full Kaggle Qwen3-TTS batch pipeline. + + Args: + texts: A single text string or a list of text strings to synthesize. + Each text produces one audio file (audio_001.wav, …). + voice: Speaker name (e.g. "Ryan", "Serena"). Ignored when + *ref_audio_path* is provided (voice cloning mode). + language: ISO language code (e.g. "en", "zh"). + output_dir: Local directory for all downloaded files. + Defaults to a timestamped folder in the cwd. + gpu_type: Kaggle accelerator type ("p100", "t4"). + seed: Random seed for reproducible generation. + temperature: Sampling temperature. + instruct: Qwen3 instruct string for style/emotion control. Works + in both custom-voice and voice-clone modes. 
+ ref_audio_path: Local path to a reference WAV file for voice + cloning. The file is uploaded once as a private + Kaggle dataset (``voicegenhub-ref-audio``) and + attached as a data source so the kernel can read + it from ``/kaggle/input/`` without any notebook + size inflation. + ref_text: Optional transcript of the reference audio. Improves + clone quality when provided. + + Returns: + List of Paths to the downloaded audio files. + A manifest.json is written alongside the audio files linking each + filename to its source text. + """ + # Normalise: accept both str and list[str] + if isinstance(texts, str): + texts = [texts] + + # Guard: CustomVoice model variants do not support generate_voice_clone. + # Fail early with a clear message rather than a cryptic Kaggle kernel error. + if ref_audio_path and "CustomVoice" in self.model_id: + raise ValueError( + f"Voice cloning (--audio-prompt) is not supported by '{self.model_id}'.\n" + "CustomVoice variants only provide predefined speaker voices.\n" + "To clone a reference voice you must use a Qwen3-TTS *base* model.\n" + "Pass: --model Qwen/Qwen3-TTS-12Hz-1.7B-Base" + ) + + username = _detect_kaggle_username() + kernel_id = f"{username}/{self.kernel_slug}" + + if output_dir is None: + from datetime import datetime + output_dir = datetime.now().strftime("%Y%m%d_%H%M%S") + f"_{gpu_type}" + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + logger.info( + "Starting Kaggle Qwen3-TTS batch pipeline", + kernel_id=kernel_id, + model=self.model_id, + voice=voice, + language=language, + gpu_type=gpu_type, + num_texts=len(texts), + ) + + # 1. 
Build notebook + push + with tempfile.TemporaryDirectory() as tmpdir: + notebook_filename = "qwen3_tts.ipynb" + notebook_path = Path(tmpdir) / notebook_filename + metadata_path = Path(tmpdir) / "kernel-metadata.json" + + # ---------------------------------------------------------------- + # Upload reference audio as a private Kaggle dataset so the + # notebook can read it from /kaggle/input/ without any + # notebook-cell-size issues. + # ---------------------------------------------------------------- + ref_dataset_slug = None # set when a reference audio is provided + if ref_audio_path: + ref_audio_file = Path(ref_audio_path) + if not ref_audio_file.exists(): + raise FileNotFoundError( + f"Reference audio file not found: {ref_audio_path}" + ) + ref_dataset_slug = _upload_ref_audio_dataset( + ref_audio_file, username + ) + logger.info( + f"Reference audio dataset ready: {username}/{ref_dataset_slug}" + ) + + # Determine ref audio kernel path and dataset sources for metadata + ref_audio_kernel_path = "" + kernel_dataset_sources = [] + if ref_dataset_slug: + ref_audio_file = Path(ref_audio_path) + ref_audio_kernel_path = ( + f"/kaggle/input/{ref_dataset_slug}/{ref_audio_file.name}" + ) + kernel_dataset_sources = [f"{username}/{ref_dataset_slug}"] + + notebook = _build_notebook_source( + texts=texts, + voice=voice, + language=language, + model_id=self.model_id, + dtype=self.dtype, + seed=seed, + temperature=temperature, + instruct=instruct, + ref_audio_kernel_path=ref_audio_kernel_path, + ref_text=ref_text or "", + ) + notebook_path.write_text(json.dumps(notebook, indent=2)) + + # Save a copy of the submitted notebook to the output folder for traceability + submitted_nb_dest = output_path / "submitted_notebook.ipynb" + submitted_nb_dest.write_text(json.dumps(notebook, indent=2)) + logger.info(f"Submitted notebook saved: {submitted_nb_dest}") + + metadata = _build_kernel_metadata( + username, self.kernel_slug, notebook_filename, + gpu_type=gpu_type, + 
dataset_sources=kernel_dataset_sources, + ) + metadata_path.write_text(json.dumps(metadata, indent=2)) + + logger.info(f"Pushing kernel to Kaggle: {kernel_id} (accelerator: {gpu_type})") + try: + acc_flag = "nvidia-p100-1" if gpu_type == "p100" else "nvidia-t4-2" + push_result = _run_cmd( + ["kaggle", "kernels", "push", "-p", tmpdir, "--accelerator", acc_flag], + capture=True, + check=True, + ) + push_out = push_result.stdout.strip() + logger.info(f"Push result: {push_out}") + + actual_kernel_id = _extract_kernel_id_from_push(push_out, kernel_id) + if actual_kernel_id != kernel_id: + logger.info( + f"Kaggle resolved kernel slug: {actual_kernel_id} " + f"(metadata had: {kernel_id})" + ) + kernel_id = actual_kernel_id + except subprocess.CalledProcessError as exc: + raise RuntimeError( + f"kaggle kernels push failed (exit {exc.returncode}).\n" + f"stdout: {exc.stdout.strip()}\n" + f"stderr: {exc.stderr.strip()}" + ) from exc + + # 2. Poll until done + self._poll_until_complete(kernel_id) + + # 3. 
Download all output files (audio_*.wav + manifest.json) + local_files = self._download_output(kernel_id, output_path, len(texts)) + + logger.info( + "Kaggle Qwen3-TTS batch pipeline complete", + output_dir=str(output_path), + num_files=len(local_files), + ) + return local_files + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _poll_until_complete(self, kernel_id: str) -> None: + """Poll Kaggle kernel status until it completes or times out.""" + timeout_seconds = self._timeout_minutes * 60 + elapsed = 0 + + logger.info( + f"Polling kernel status (timeout: {self._timeout_minutes}m, " + f"interval: {self._poll_interval}s)…", + kernel_id=kernel_id, + ) + + while elapsed < timeout_seconds: + try: + result = _run_cmd( + ["kaggle", "kernels", "status", kernel_id], + capture=True, + check=True, + ) + status_line = result.stdout.strip() + logger.info(f"Kernel status: {status_line}") + + status_lower = status_line.lower() + if "complete" in status_lower: + logger.info("Kernel finished successfully.") + return + elif "error" in status_lower or "cancel" in status_lower: + raise RuntimeError( + f"Kaggle kernel ended with non-successful status: {status_line}\n" + "Check the kernel logs at https://www.kaggle.com/code" + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Status check failed: {e.stderr.strip()}, retrying…") + + time.sleep(self._poll_interval) + elapsed += self._poll_interval + + raise TimeoutError( + f"Kaggle kernel did not complete within {self._timeout_minutes} minutes. " + f"Check manually: https://www.kaggle.com/code/{kernel_id}" + ) + + def _download_output( + self, + kernel_id: str, + output_path: Path, + num_texts: int, + ) -> list: + """Download all kernel outputs (audio_*.wav + manifest.json) to output_path. + + Returns a list of Path objects for each downloaded audio file. 
+ Retries up to 3 times with a 30-second delay to handle the case where + Kaggle marks a kernel COMPLETE before output files are fully staged. + """ + max_retries = 3 + retry_delay_seconds = 30 + + with tempfile.TemporaryDirectory() as dl_dir: + dl_path = Path(dl_dir) + wav_files = [] + + for attempt in range(1, max_retries + 1): + logger.info(f"Downloading kernel outputs from {kernel_id}… (attempt {attempt}/{max_retries})") + try: + _run_cmd( + ["kaggle", "kernels", "output", kernel_id, "-p", dl_dir], + capture=True, + check=True, + ) + except subprocess.CalledProcessError as exc: + logger.warning(f"Download attempt {attempt} failed (exit {exc.returncode}): {exc.stderr}") + if attempt < max_retries: + logger.info(f"Retrying in {retry_delay_seconds}s…") + time.sleep(retry_delay_seconds) + continue + raise + + # Extract any zip archives first + for zf in list(dl_path.rglob("*.zip")): + logger.info(f"Extracting {zf.name}…") + with zipfile.ZipFile(zf, "r") as z: + z.extractall(dl_path) + + wav_files = sorted(dl_path.rglob("*.wav")) + if wav_files: + break + + # Kaggle sometimes returns COMPLETE before output files are staged + if attempt < max_retries: + logger.warning( + f"No .wav files found on attempt {attempt} — " + f"Kaggle output may not be staged yet. 
Retrying in {retry_delay_seconds}s…" + ) + time.sleep(retry_delay_seconds) + + manifest_files = list(dl_path.rglob("manifest.json")) + + # Always copy logs and executed notebook — survive even if no wavs + for log_file in dl_path.rglob("*.log"): + dest = output_path / log_file.name + shutil.copy2(log_file, dest) + logger.info(f"Kernel log saved: {dest} ({dest.stat().st_size:,} bytes)") + + for nb_file in dl_path.rglob("*.ipynb"): + dest = output_path / "executed_notebook.ipynb" + shutil.copy2(nb_file, dest) + logger.info(f"Executed notebook saved: {dest} ({dest.stat().st_size:,} bytes)") + + if not wav_files: + all_files = list(dl_path.rglob("*")) + file_list = ", ".join(f.name for f in all_files if f.is_file()) + raise FileNotFoundError( + f"No .wav files found in kernel output. Downloaded files: {file_list}\n" + f"Check kernel logs: https://www.kaggle.com/code/{kernel_id}" + ) + + # Copy all wav files + local_wavs = [] + for wav in wav_files: + dest = output_path / wav.name + shutil.copy2(wav, dest) + logger.info(f"Audio saved locally: {dest} ({dest.stat().st_size:,} bytes)") + local_wavs.append(dest) + + # Copy manifest.json if present + if manifest_files: + manifest_dest = output_path / "manifest.json" + shutil.copy2(manifest_files[0], manifest_dest) + logger.info(f"Manifest saved locally: {manifest_dest}") + else: + # Generate a minimal fallback manifest + manifest_dest = output_path / "manifest.json" + fallback = [ + {"index": i + 1, "file": wav.name, "text": f"(text {i + 1} of {num_texts})"} + for i, wav in enumerate(local_wavs) + ] + manifest_dest.write_text( + json.dumps(fallback, ensure_ascii=False, indent=2), encoding="utf-8" + ) + logger.info(f"Fallback manifest written: {manifest_dest}") + + return local_wavs diff --git a/src/voicegenhub/providers/qwen.py b/src/voicegenhub/providers/qwen.py index 4a36e9b..585d559 100644 --- a/src/voicegenhub/providers/qwen.py +++ b/src/voicegenhub/providers/qwen.py @@ -27,6 +27,22 @@ logger = get_logger(__name__) +# 
Speaker metadata: name -> (language, locale, gender, description). +# The model's get_supported_speakers() only returns names; this dict supplies +# the additional info needed to build a Voice object. New speakers returned +# by the model but absent here fall back to neutral/multilingual defaults. +_SPEAKER_META: Dict[str, tuple] = { + "Ryan": ("en", "en-US", VoiceGender.MALE, "Dynamic male, strong rhythmic drive — English native"), + "Aiden": ("en", "en-US", VoiceGender.MALE, "Sunny American male, clear midrange — English native"), + "Vivian": ("zh", "zh-CN", VoiceGender.FEMALE, "Bright, slightly edgy young female — Chinese native"), + "Serena": ("zh", "zh-CN", VoiceGender.FEMALE, "Warm, gentle young female — Chinese native"), + "Uncle_Fu": ("zh", "zh-CN", VoiceGender.MALE, "Seasoned male, low mellow timbre — Chinese native"), + "Dylan": ("zh", "zh-CN", VoiceGender.MALE, "Youthful Beijing male, natural timbre — Chinese native"), + "Eric": ("zh", "zh-CN", VoiceGender.MALE, "Lively Chengdu male, slightly husky — Chinese native"), + "Ono_Anna": ("ja", "ja-JP", VoiceGender.FEMALE, "Playful female, light and nimble — Japanese native"), + "Sohee": ("ko", "ko-KR", VoiceGender.FEMALE, "Warm female with rich emotion — Korean native"), +} + class QwenTTSProvider(TTSProvider): """ @@ -173,51 +189,55 @@ async def initialize(self) -> None: provider=self.provider_id, ) from e - async def get_voices(self) -> List[Voice]: - """Get available voices based on generation mode.""" - await self.initialize() - - voices = [] + async def get_voices(self, language: Optional[str] = None) -> List[Voice]: + """Return Qwen3-TTS CustomVoice speakers by querying the loaded model. 
- if self.generation_mode == "custom_voice": - # Get supported speakers - try: - speakers = self._model.model.get_supported_speakers() - if speakers: - for speaker in speakers: - voices.append( - Voice( - id=speaker, - name=speaker.capitalize(), - language="multilingual", - locale="mul", - gender=VoiceGender.NEUTRAL, - voice_type=VoiceType.NEURAL, - provider=self.provider_id, - sample_rate=24000, - description=f"Qwen 3 TTS CustomVoice speaker: {speaker}", - ) - ) - except Exception as e: - logger.warning(f"Could not get speakers: {e}") - - if not voices: - # Return generic voice entries for other modes - voices.append( - Voice( - id="default", - name="Default Voice", - language="multilingual", - locale="mul", - gender=VoiceGender.NEUTRAL, - voice_type=VoiceType.NEURAL, - provider=self.provider_id, - sample_rate=24000, - description=f"Qwen 3 TTS {self.generation_mode} mode", + Speakers are enriched with language/gender metadata from _SPEAKER_META. + Speakers not in _SPEAKER_META fall back to neutral/multilingual defaults. + If *language* is provided (e.g. 'en', 'zh', 'ja', 'ko'), only voices whose + native language matches are returned. If no match, the full list is returned. 
+ """ + await self.initialize() + speakers = self._model.model.get_supported_speakers() or [] + + if not speakers: + return [Voice( + id="default", + name="Default", + language="multilingual", + locale="multilingual", + gender=VoiceGender.NEUTRAL, + voice_type=VoiceType.NEURAL, + provider="qwen", + )] + + voices: List[Voice] = [] + for speaker in speakers: + meta = _SPEAKER_META.get(speaker) + if meta: + lang, locale, gender, desc = meta + else: + lang, locale, gender, desc = ( + "multilingual", "multilingual", VoiceGender.NEUTRAL, + f"{speaker} speaker", ) - ) - - return voices + voices.append(Voice( + id=speaker, + name=speaker, + language=lang, + locale=locale, + gender=gender, + voice_type=VoiceType.NEURAL, + provider="qwen", + description=desc, + )) + + if language is None: + return voices + + lang_filter = language.lower().split("-")[0] # normalise "en-US" → "en" + filtered = [v for v in voices if v.language == lang_filter] + return filtered if filtered else voices def _get_native_speaker_for_language(self, language: str) -> str: """Get native speaker for a given language.""" @@ -272,6 +292,15 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: } generate_kwargs.update(request.extra_params) + # Seed support: pin torch seed before generation for reproducibility + seed = generate_kwargs.pop("seed", None) + if seed is not None: + seed = int(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + logger.info(f"Pinned random seed: {seed}") + # Generate based on mode if self.generation_mode == "custom_voice": speaker = generate_kwargs.pop("speaker", self.default_speaker or request.voice_id) @@ -323,8 +352,9 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: ref_audio = generate_kwargs.pop("ref_audio", self.default_ref_audio) ref_text = generate_kwargs.pop("ref_text", self.default_ref_text) x_vector_only = generate_kwargs.pop("x_vector_only_mode", self.x_vector_only_mode) + instruct = 
generate_kwargs.pop("instruct", self.default_instruct) - wavs, sample_rate = self._model.generate_voice_clone( + clone_call_kwargs = dict( text=request.text, language=language, ref_audio=ref_audio, @@ -333,6 +363,11 @@ async def synthesize(self, request: TTSRequest) -> TTSResponse: non_streaming_mode=self.non_streaming_mode, **generate_kwargs ) + if instruct: + clone_call_kwargs["instruct"] = instruct + logger.info(f"Voice clone + instruct: '{instruct}'") + + wavs, sample_rate = self._model.generate_voice_clone(**clone_call_kwargs) else: raise TTSError( diff --git a/src/voicegenhub/utils/compatibility.py b/src/voicegenhub/utils/compatibility.py index 11048eb..30923a4 100644 --- a/src/voicegenhub/utils/compatibility.py +++ b/src/voicegenhub/utils/compatibility.py @@ -7,6 +7,7 @@ logger = get_logger(__name__) + def apply_cpu_compatibility_patches(): """Apply patches to ensure stability on CPU-only environments.""" @@ -29,7 +30,9 @@ def apply_cpu_compatibility_patches(): mock_codec.__version__ = "0.9.1" mock_codec.__spec__ = ModuleSpec("torchcodec", None) - class Frame: pass + class Frame: + pass + class Decoder: def __init__(self, *args, **kwargs): pass @@ -62,6 +65,7 @@ def patched_version(package_name): if "torch" in sys.modules: _patch_torch_cuda(sys.modules["torch"]) + def _patch_torch_cuda(torch): """Specific patches for torch when it's already loaded.""" if not torch.cuda.is_available(): @@ -78,6 +82,7 @@ def _patch_torch_cuda(torch): except Exception: pass + def ensure_torchcodec(): """Specific check for torchcodec to satisfy Transformers >= 4.51.""" try: