From 40171cb6ff4b7e85b929bde146d670c102913b3d Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 18 May 2026 16:51:10 +0900 Subject: [PATCH 01/11] Add HTS label support infrastructure --- OpenUtau.Core/Hts/HTSLabelPhonemizer.cs | 675 +++++++++++++++ OpenUtau.Core/Hts/HTSLabelRenderer.cs | 484 +++++++++++ OpenUtau.Core/Render/RenderPhrase.cs | 17 +- OpenUtau.Core/Util/HTS.cs | 770 ++++++++++++++++++ .../Util}/HTSLabelFile.cs | 4 +- .../Util}/Merlin.cs | 5 +- .../Util}/Python.cs | 2 +- .../Util}/Scaler.cs | 2 +- .../EnunuOnnx/EnunuOnnxPhonemizer.cs | 70 +- OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs | 256 ------ OpenUtau.Test/Core/Util/HtsSpecTests.cs | 311 +++++++ .../Plugins/HtsLabelPhonemizerTest.cs | 242 ++++++ 12 files changed, 2548 insertions(+), 290 deletions(-) create mode 100644 OpenUtau.Core/Hts/HTSLabelPhonemizer.cs create mode 100644 OpenUtau.Core/Hts/HTSLabelRenderer.cs create mode 100644 OpenUtau.Core/Util/HTS.cs rename {OpenUtau.Plugin.Builtin/EnunuOnnx => OpenUtau.Core/Util}/HTSLabelFile.cs (99%) rename {OpenUtau.Plugin.Builtin/EnunuOnnx => OpenUtau.Core/Util}/Merlin.cs (98%) rename {OpenUtau.Plugin.Builtin/EnunuOnnx => OpenUtau.Core/Util}/Python.cs (94%) rename {OpenUtau.Plugin.Builtin/EnunuOnnx => OpenUtau.Core/Util}/Scaler.cs (97%) delete mode 100644 OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs create mode 100644 OpenUtau.Test/Core/Util/HtsSpecTests.cs create mode 100644 OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs diff --git a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs new file mode 100644 index 000000000..d3c7f5f46 --- /dev/null +++ b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs @@ -0,0 +1,675 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using K4os.Hash.xxHash; +using OpenUtau.Api; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.io.hts; +using Serilog; +using static System.Net.Mime.MediaTypeNames; + 
/// <summary>
/// Binds the phonemizer to <paramref name="singer"/> and reloads the phoneme table and the
/// G2P dictionary that belong to it.
/// </summary>
/// <remarks>
/// The configuration root is the "enunux" sub-folder of the singer when that folder contains an
/// enuconfig.yaml, otherwise the singer folder itself. The phoneme table is loaded before the
/// G2P dictionary because <c>LoadG2p</c> copies every entry of <c>phoneDict</c> into the
/// dictionary it builds. Load failures are logged, not thrown.
/// </remarks>
/// <param name="singer">Singer to bind. When null only the reference is stored.</param>
public override void SetSinger(USinger singer) {
    this.singer = singer;
    if (singer == null) {
        return;
    }
    phoneDict.Clear();

    // Pick the configuration root. The previous if / if-else sequence always overwrote the
    // result of the "enunux" probe, so that probe had no effect.
    var enunuxPath = Path.Combine(singer.Location, "enunux");
    var rootPath = File.Exists(Path.Join(enunuxPath, "enuconfig.yaml"))
        ? enunuxPath
        : singer.Location;

    // Load the phoneme table first so that LoadG2p sees a populated phoneDict.
    var enunuDictPath = Path.Join(rootPath, tablePath);
    try {
        LoadDict(enunuDictPath, singer.TextFileEncoding);
    } catch (Exception e) {
        // Keep going: a G2P built from enunux.yaml alone is still better than none.
        Log.Error(e, $"failed to load dictionary from {enunuDictPath}");
    }

    // Build the G2P from enunux.yaml plus whatever the table provided.
    try {
        g2p = LoadG2p(singer.Location);
    } catch (Exception e) {
        Log.Error(e, "failed to load g2p dictionary");
    }
}
/// <summary>
/// Splits the whitespace-separated tokens of a note's lyric into the text that precedes the
/// first valid G2P symbol (prefix) and the text that follows it (suffix).
/// </summary>
/// <remarks>
/// Tokens that are valid symbols are dropped from both parts. Non-symbol tokens are
/// concatenated without a separator. A null or empty lyric yields two empty strings.
/// </remarks>
/// <param name="note">Note whose lyric is inspected.</param>
/// <returns>The concatenated prefix and suffix text.</returns>
protected (string prefix, string suffix) GetPrefixAndSuffix(Note note) {
    if (string.IsNullOrEmpty(note.lyric)) {
        return (string.Empty, string.Empty);
    }

    var prefix = new StringBuilder();
    var suffix = new StringBuilder();
    var symbolSeen = false;
    foreach (var token in note.lyric.Split()) {
        if (g2p.IsValidSymbol(token)) {
            // Once a symbol has been seen, all remaining plain text belongs to the suffix.
            // (The former "else if (existSymbol && ...)" branch could never run and is removed.)
            symbolSeen = true;
            continue;
        }
        (symbolSeen ? suffix : prefix).Append(token);
    }
    return (prefix.ToString(), suffix.ToString());
}
/// <summary>
/// Tells whether <paramref name="note"/> is a vowel-extension note, i.e. its lyric starts with
/// "+~" or "+*".
/// </summary>
/// <remarks>
/// Notes synthesized by <c>HandleNotEnoughNotes</c> carry no lyric, so a null lyric is treated
/// as "not an extension" instead of throwing. The comparison is ordinal because the markers are
/// fixed ASCII tokens, not linguistic text.
/// </remarks>
/// <param name="note">Note to test.</param>
/// <returns>True when the lyric begins with one of the extension markers.</returns>
protected bool IsSyllableVowelExtensionNote(Note note) {
    var lyric = note.lyric;
    if (lyric == null) {
        return false;
    }
    return lyric.StartsWith("+~", StringComparison.Ordinal)
        || lyric.StartsWith("+*", StringComparison.Ordinal);
}
/// <summary>
/// Maps a phoneme symbol to its one-letter HTS type code.
/// </summary>
/// <param name="phoneme">Phoneme symbol to classify.</param>
/// <returns>
/// "xx" for the literal "xx" symbol, "v" for vowels, "p" for pauses, "s" for silences,
/// "b" for breaks and "c" for everything else. Categories are tested in that order.
/// </returns>
public string GetPhonemeType(string phoneme) {
    return phoneme switch {
        "xx" => "xx",
        _ when vowels.Contains(phoneme) => "v",
        _ when pauses.Contains(phoneme) => "p",
        _ when silences.Contains(phoneme) => "s",
        _ when breaks.Contains(phoneme) => "b",
        // Anything not listed in a category is treated as a consonant.
        _ => "c",
    };
}
/// <summary>
/// Resolves the symbols of a note group, locates the vowel of every syllable and makes the
/// number of notes match the number of vowels.
/// </summary>
/// <param name="notes">Note group; the first note supplies the lyric / phonetic hint.</param>
/// <returns>
/// The expanded symbols, the indices of the vowels inside them and the adjusted notes, or three
/// nulls when no symbols could be resolved.
/// </returns>
private (string[], int[], Note[]) GetSymbolsAndVowels(Note[] notes) {
    var resolved = GetSymbols(notes[0]);
    if (resolved == null) {
        return (null, null, null);
    }

    // An empty result is replaced by a single empty symbol so later indexing stays valid.
    var expanded = ApplyExtensions(resolved.Length == 0 ? new string[] { "" } : resolved, notes);

    var nuclei = ExtractVowels(expanded);
    if (nuclei.Count == 0) {
        // No vowel at all: the last phoneme stands in as the syllable nucleus.
        nuclei.Add(expanded.Length - 1);
    }

    var fitted = notes;
    var surplus = notes.Length - nuclei.Count;
    if (surplus < 0) {
        fitted = HandleNotEnoughNotes(notes, nuclei);
    } else if (surplus > 0) {
        fitted = HandleExcessNotes(notes, nuclei);
    }
    return (expanded, nuclei.ToArray(), fitted);
}
/// <summary>
/// Creates one <c>HTSPhoneme</c> per symbol of <paramref name="htsNote"/> and fills in its type,
/// its 1-based position from both ends of the note, and its distance to the nearest vowel on
/// either side.
/// </summary>
/// <param name="htsNote">Note whose symbols are converted.</param>
/// <returns>The phonemes of the note, in symbol order.</returns>
HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) {
    var phones = htsNote.symbols.Select(symbol => new HTSPhoneme(symbol, htsNote)).ToArray();
    var total = phones.Length;

    // Forward sweep: type, positions and distance to the previous vowel. Everything read from
    // the left neighbour has already been written by an earlier iteration.
    for (var idx = 0; idx < total; idx++) {
        phones[idx].type = GetPhonemeType(phones[idx].symbol);
        phones[idx].position = idx + 1;
        phones[idx].position_backward = total - idx;

        if (idx == 0 || !phones[idx].type.Equals("c")) {
            continue;
        }
        var left = phones[idx - 1];
        if (left.type.Equals("v")) {
            phones[idx].prev_vowel_distance = 1;
        } else {
            // Chain through the neighbour only when it knows a vowel behind it.
            phones[idx].prev_vowel_distance =
                left.prev_vowel_distance > 0 ? left.prev_vowel_distance + 1 : 0;
        }
    }

    // Backward sweep: distance to the next vowel.
    for (var idx = total - 2; idx >= 0; idx--) {
        if (!phones[idx].type.Equals("c")) {
            continue;
        }
        var right = phones[idx + 1];
        if (right.type.Equals("v")) {
            phones[idx].next_vowel_distance = 1;
        } else {
            phones[idx].next_vowel_distance =
                right.next_vowel_distance > 0 ? right.next_vowel_distance + 1 : 0;
        }
    }
    return phones;
}
/// <summary>
/// Converts one phrase into an HTS full-context score label, passes it to <c>SendScore</c>,
/// then reads the mono timing label back and caches the per-note phoneme timings in
/// <c>partResult</c>, keyed by the position of each group's first note.
/// </summary>
/// <remarks>
/// A "pau" note one bar long is placed before and after the phrase. If the mono timing label
/// does not exist after <c>SendScore</c> returns, or contains no phoneme entries, an error is
/// logged and nothing is cached. I/O errors while writing the score label are logged and
/// rethrown.
/// </remarks>
/// <param name="phrase">Note groups that make up the phrase.</param>
protected override void ProcessPart(Note[][] phrase) {
    tmpPath = Path.Join(PathManager.Inst.CachePath, $"lab-{HashPhraseGroups(phrase):x16}");
    htstmpPath = tmpPath + "_htstemp";
    fullScorePath = Path.Join(htstmpPath, $"full_score.lab");
    fullTimingPath = Path.Join(htstmpPath, $"full_timing.lab");
    monoScorePath = Path.Join(htstmpPath, $"mono_score.lab");
    monoTimingPath = Path.Join(htstmpPath, $"mono_timing.lab");

    phrase = PhraseAdjustments(phrase) ?? phrase;

    var startTick = phrase[0][0].position;
    var endTick = phrase[^1][^1].position + phrase[^1][^1].duration;

    // Padding length: one bar at the tempo / time signature found at each end of the phrase.
    var sigStart = timeAxis.TimeSignatureAtTick(startTick);
    var bpmStart = timeAxis.GetBpmAtTick(startTick);
    var barLenMsStart = (int)Math.Round(60000.0 / bpmStart * sigStart.beatPerBar);
    var barLenTicksStart = timeAxis.MsPosToTickPos(barLenMsStart);

    var sigEnd = timeAxis.TimeSignatureAtTick(endTick);
    var bpmEnd = timeAxis.GetBpmAtTick(endTick);
    var barLenMsEnd = (int)Math.Round(60000.0 / bpmEnd * sigEnd.beatPerBar);
    var barLenTicksEnd = timeAxis.MsPosToTickPos(barLenMsEnd);

    // Whole sentence = leading bar + phrase body + trailing bar.
    var sentenceDurMs = barLenMsStart + (int)timeAxis.MsBetweenTickPos(startTick, endTick) + barLenMsEnd;
    var sentenceDurTicks = barLenTicksStart + (endTick - startTick) + barLenTicksEnd;

    // notePhIndex[g] .. notePhIndex[g + 1] is the phoneme range of group g; index 0 is the
    // leading padding phoneme, so the first group starts at 1.
    var notePhIndex = new List<int> { 1 };
    var phAlignPoints = new List<Tuple<int, double>>();

    // Leading padding "pau".
    timeAxis.TickPosToBarBeat(startTick - barLenTicksStart, out var barStart, out var beatStart, out var _);
    var sigForPadStart = timeAxis.TimeSignatureAtTick(startTick - barLenTicksStart);
    var PaddingNoteStart = new HTSNote(
        symbols: new string[] { "pau" },
        beatPerBar: sigForPadStart.beatPerBar,
        beatUnit: sigForPadStart.beatUnit,
        positionBar: barStart,
        positionBeat: beatStart,
        key: key,
        bpm: timeAxis.GetBpmAtTick(startTick - barLenTicksStart),
        tone: 0,
        isSlur: false,
        isRest: true,
        lang: string.Empty,
        accent: string.Empty,
        startms: 0,
        endms: barLenMsStart,
        positionTicks: startTick - barLenTicksStart,
        durationTicks: barLenTicksStart
    );
    var htsNotes = new List<HTSNote> { PaddingNoteStart };
    var htsPhonemes = new List<HTSPhoneme>();
    // The hook may return null (it is null-coalesced for ordinary notes below); fall back to
    // the unmodified phonemes instead of passing null to AddRange.
    var startPadPhonemes = HTSNoteToPhonemes(PaddingNoteStart);
    htsPhonemes.AddRange(CustomHTSPhonemeContext(startPadPhonemes, phrase[0]) ?? startPadPhonemes);

    // Score notes -> HTS notes.
    for (var n = 0; n < phrase.Length; ++n) {
        // MakeSyllables returns null when it cannot build syllables; treat that as "no
        // syllables" so the group still gets its (empty) phoneme range below.
        var Syllables = MakeSyllables(phrase[n], startTick) ?? Array.Empty<HTSNote>();
        // Shift every note by the leading padding.
        foreach (var note in Syllables) {
            note.startMs += barLenMsStart;
            note.endMs += barLenMsStart;
        }
        htsNotes.AddRange(Syllables);

        for (var noteIndex = 0; noteIndex < Syllables.Length; noteIndex++) {
            var htsNote = Syllables[noteIndex];
            var tmpPhonemes = HTSNoteToPhonemes(htsNote);
            var notePhonemes = CustomHTSPhonemeContext(tmpPhonemes, phrase[n]) ?? tmpPhonemes;

            // The first vowel of the note is the alignment anchor (first phoneme if none).
            var firstVowelIndex = 0;
            for (var phIndex = 0; phIndex < htsNote.symbols.Length; phIndex++) {
                if (g2p.IsVowel(htsNote.symbols[phIndex])) {
                    firstVowelIndex = phIndex;
                    break;
                }
            }
            phAlignPoints.Add(Tuple.Create(
                htsPhonemes.Count + firstVowelIndex,
                timeAxis.TickPosToMsPos(htsNote.positionTicks) + barLenMsStart
            ));
            htsPhonemes.AddRange(notePhonemes);
        }
        notePhIndex.Add(htsPhonemes.Count);
    }

    // Trailing padding "pau", positioned at the end tick of the phrase.
    timeAxis.TickPosToBarBeat(endTick, out var barEnd, out var beatEnd, out var _);
    var PaddingNoteEnd = new HTSNote(
        symbols: new string[] { "pau" },
        beatPerBar: sigEnd.beatPerBar,
        beatUnit: sigEnd.beatUnit,
        positionBar: barEnd,
        positionBeat: beatEnd,
        key: key,
        bpm: bpmEnd,
        tone: 0,
        isSlur: false,
        isRest: true,
        lang: string.Empty,
        accent: string.Empty,
        startms: sentenceDurMs - barLenMsEnd,
        endms: sentenceDurMs,
        positionTicks: endTick,
        durationTicks: barLenTicksEnd
    );
    htsNotes.Add(PaddingNoteEnd);
    var endPadPhonemes = HTSNoteToPhonemes(PaddingNoteEnd);
    htsPhonemes.AddRange(CustomHTSPhonemeContext(endPadPhonemes, phrase[^1]) ?? endPadPhonemes);

    // Final anchor: the end of the trailing padding.
    var lastNote = htsNotes[^1];
    phAlignPoints.Add(Tuple.Create(
        htsPhonemes.Count,
        timeAxis.TickPosToMsPos(lastNote.positionTicks + lastNote.durationTicks) + barLenMsStart
    ));
    var htsPhrase = new HTSPhrase(htsNotes.ToArray());
    htsPhrase.UpdateResolution(resolution);
    htsPhrase.totalNotes = htsNotes.Count - 2;
    htsPhrase.totalPhonemes = htsPhonemes.Count - 3;
    htsPhrase.totalPhrases = 1;
    // Link neighbouring notes and neighbouring phonemes.
    foreach (var i in Enumerable.Range(0, htsNotes.Count)) {
        htsNotes[i].parent = htsPhrase;
        htsNotes[i].index = i;
        htsNotes[i].indexBackwards = htsNotes.Count - i - 1;
        htsNotes[i].sentenceDurMs = sentenceDurMs;
        htsNotes[i].sentenceDurTicks = sentenceDurTicks;
        if (i > 0) {
            htsNotes[i].prev = htsNotes[i - 1];
            htsNotes[i - 1].next = htsNotes[i];
        }
    }
    for (var i = 1; i < htsPhonemes.Count; ++i) {
        htsPhonemes[i].prev = htsPhonemes[i - 1];
        htsPhonemes[i - 1].next = htsPhonemes[i];
    }

    try {
        if (!Directory.Exists(htstmpPath)) {
            Directory.CreateDirectory(htstmpPath);
        }
        File.WriteAllLines(fullScorePath, htsPhonemes.Select(x => x.dump()));
    } catch (Exception e) {
        Log.Error(e.ToString());
        throw;
    }

    SendScore(phrase);
    if (!File.Exists(monoTimingPath)) {
        Log.Error($"File not found.:{monoTimingPath}");
        return;
    }

    var hTSLabels = hts.load(monoTimingPath, Encoding.UTF8);

    // Label times are in 100 ns units; divide by 10000 to get milliseconds.
    var labPositions =
        hTSLabels.Skip(1).SkipLast(1).Select(label => (label.end_time - label.start_time) / 10000.0).ToList();
    if (labPositions.Count == 0) {
        // Fewer than three labels: nothing but padding came back, and indexing [0] would throw.
        Log.Error($"Timing label has no phoneme entries.:{monoTimingPath}");
        return;
    }
    // Duplicate the first / last entry to stand in for the two padding phonemes.
    labPositions.Insert(0, labPositions[0]);
    labPositions.Add(labPositions[^1]);

    var positions = HTSContextBuilder.AlignTimingPositions(labPositions, phAlignPoints);

    // Publish the timing of every lyric-carrying group.
    var phonemesRedirected = htsPhonemes.Select(x => x.symbol).ToArray();
    for (var groupIndex = 0; groupIndex < phrase.Length; groupIndex++) {
        var group = phrase[groupIndex];
        if (group[0].lyric.StartsWith("+")) {
            continue;
        }
        var notePos = timeAxis.TickPosToMsPos(group[0].position) + barLenMsStart; // ms
        var noteResult = HTSContextBuilder.BuildAlignedNoteTimingResult(
            phonemesRedirected,
            notePhIndex[groupIndex],
            notePhIndex[groupIndex + 1],
            positions,
            notePos,
            timeAxis.TicksBetweenMsPos);
        partResult[group[0].position] = noteResult;
    }
}
nextNeighbour, Note[] prevs) { + if (!partResult.TryGetValue(notes[0].position, out var phonemes)) { + throw new Exception("error"); + } + return new Result { + phonemes = phonemes + .Select((tu) => new Phoneme() { + phoneme = tu.Item1, + position = tu.Item2, + }) + .ToArray(), + }; + } + } +} diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs new file mode 100644 index 000000000..986505de6 --- /dev/null +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -0,0 +1,484 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using OpenUtau.Api; +using OpenUtau.Core.Render; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using Serilog; + +namespace OpenUtau.Core.Hts { + public abstract class HTSLabelRenderer : IRenderer { + + static readonly object lockObj = new object(); + + public virtual bool SupportsRenderPitch => true; + + public abstract USingerType SingerType { get; } + + public abstract bool SupportsExpression(UExpressionDescriptor descriptor); + + protected TimeAxis timeAxis; + + //information used by HTS writer + protected Dictionary phoneDict = new Dictionary(); + protected List vowels = new List(); + protected List consonants = new List(); + protected List breaks = new List(); + protected List pauses = new List(); + protected List silences = new List(); + protected List unvoiced = new List(); + protected List macronLyrics = new List(); + protected int startTick; + protected int endTick; + protected UTimeSignature sigStart; + protected double bpmStart; + protected double headMs; + protected int barLenTicksStart; + protected UTimeSignature sigEnd; + protected double bpmEnd; + protected double tailMs; + protected int barLenTicksEnd; + protected string lang = ""; + protected int key = 0; + protected int resolution = 480; + protected int framePeriod = 5; + + //information used by openutau phonemizer + protected 
/// <summary>
/// Builds the G2P dictionary from <c>phoneDict</c> and fills the phoneme category lists
/// (<c>vowels</c>, <c>breaks</c>, <c>pauses</c>, <c>silences</c>, <c>consonants</c>,
/// <c>macronLyrics</c>) from its category keys.
/// </summary>
/// <remarks>
/// Category keys ("VOWELS", "BREAK", "PAUSES", "SILENCES", "PHONEME_CL", "MACRON",
/// "VOWEL_REDUCTION") that are absent from <c>phoneDict</c> are treated as empty. Indexing them
/// directly threw <see cref="KeyNotFoundException"/>, which always happened when this method
/// was reached from <c>SetUp</c>, because <c>SetUp</c> clears the table and adds only rest lyrics.
/// For every entry, extra entries are registered with each vowel-reduction suffix (vowels
/// removed) and each macron suffix (vowels appended once more).
/// </remarks>
/// <returns>A fallback chain containing the single dictionary that was built.</returns>
protected IG2p LoadG2p() {
    // Missing category -> empty list instead of KeyNotFoundException.
    string[] Category(string name) =>
        phoneDict.TryGetValue(name, out var members) ? members : Array.Empty<string>();

    var builder = G2pDictionary.NewBuilder();
    vowels.AddRange(Category("VOWELS"));
    breaks.AddRange(Category("BREAK"));
    pauses.AddRange(Category("PAUSES"));
    silences.AddRange(Category("SILENCES"));
    consonants.AddRange(Category("PHONEME_CL"));
    macronLyrics.AddRange(Category("MACRON"));

    foreach (var dict in phoneDict.Values) {
        foreach (var phoneme in dict) {
            // Anything that belongs to no category is filed as a consonant.
            if (!consonants.Contains(phoneme) && !vowels.Contains(phoneme) &&
                !breaks.Contains(phoneme) && !pauses.Contains(phoneme) &&
                !silences.Contains(phoneme)) {
                consonants.Add(phoneme);
            }
            // Second argument: true for every symbol that is not a consonant.
            builder.AddSymbol(phoneme, !consonants.Contains(phoneme));
        }
    }

    // Neither list changes while the entries are enumerated, so look them up once.
    var reductions = Category("VOWEL_REDUCTION");
    var macrons = Category("MACRON");
    foreach (var entry in phoneDict.Keys) {
        builder.AddEntry(entry, phoneDict[entry]);
        foreach (var reduction in reductions) {
            var phonemes = phoneDict[entry].Except(vowels).ToList();
            if (phonemes.Count == 0) continue;
            builder.AddEntry(entry + reduction, phonemes);
        }
        foreach (var macron in macrons) {
            var addPhonemes = phoneDict[entry].Where(x => vowels.Contains(x)).ToList();
            if (addPhonemes.Count == 0) continue;
            var phonemes = phoneDict[entry].ToList();
            phonemes.AddRange(addPhonemes);
            builder.AddEntry(entry + macron, phonemes);
            macronLyrics.Add(entry + macron);
        }
    }
    return new G2pFallbacks(new IG2p[] { builder.Build() });
}
htsNote.langDependent = "0"; // no macron + if (macronLyrics.Contains(note.lyric)) { + htsNote.langDependent = "1"; // macron + } + } + return htsNote; + } + + //make a HTS Note from given symbols and UNotes + private HTSNote makeHtsNote(string[] symbols, RenderNote note, int startTick, double leadingMs) { + var positiontick = startTick + note.position; + var endTick = positiontick + note.duration; + UTimeSignature sig = timeAxis.TimeSignatureAtTick(positiontick); + timeAxis.TickPosToBarBeat(positiontick, out int bar, out int beat, out int remainingTicks); + var isRest = symbols.Select(x => x.ToLowerInvariant()).Any(x => pauses.Contains(x) || silences.Contains(x) || breaks.Contains(x)); + var htsNote = new HTSNote( + symbols: symbols, + tone: note.tone, + isSlur: IsSyllableVowelExtensionNote(note), + isRest: isRest, + lang: isRest ? string.Empty : lang, + accent: string.Empty, + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: key, + bpm: timeAxis.GetBpmAtTick(positiontick), + startms: timeAxis.MsBetweenTickPos(startTick, positiontick) + leadingMs, + endms: timeAxis.MsBetweenTickPos(startTick, endTick) + leadingMs, + positionTicks: positiontick, + durationTicks: note.duration + ); + return CustomHTSNoteContext(htsNote, note) ?? 
/// <summary>
/// Creates one <c>HTSPhoneme</c> per symbol of <paramref name="htsNote"/> and fills in its type,
/// its 1-based position from both ends of the note, and its distance to the nearest vowel on
/// either side.
/// </summary>
/// <remarks>
/// Two fixes bring this in line with <c>HTSLabelPhonemizer.HTSNoteToPhonemes</c>: the backward
/// pass now includes index 0, so a leading consonant such as the "k" of "k a" gets its
/// <c>next_vowel_distance</c>; and a distance is only chained through a non-vowel neighbour
/// when that neighbour has a positive distance, otherwise it is 0.
/// </remarks>
/// <param name="htsNote">Note whose symbols are converted.</param>
/// <returns>The phonemes of the note, in symbol order.</returns>
private HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) {
    var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray();

    // Forward pass: type, positions and distance to the previous vowel.
    for (int i = 0; i < htsPhonemes.Length; i++) {
        htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol);
        htsPhonemes[i].position = i + 1;
        htsPhonemes[i].position_backward = htsPhonemes.Length - i;

        int prev = i - 1;
        if (prev < 0 || !htsPhonemes[i].type.Equals("c")) {
            continue;
        }
        if (htsPhonemes[prev].type.Equals("v")) {
            htsPhonemes[i].prev_vowel_distance = 1;
        } else if (htsPhonemes[prev].prev_vowel_distance > 0) {
            htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1;
        } else {
            // No vowel anywhere before this consonant.
            htsPhonemes[i].prev_vowel_distance = 0;
        }
    }

    // Backward pass: distance to the next vowel. Runs down to index 0 inclusive.
    for (int i = htsPhonemes.Length - 1; i >= 0; --i) {
        int next = i + 1;
        if (next >= htsPhonemes.Length || !htsPhonemes[i].type.Equals("c")) {
            continue;
        }
        if (htsPhonemes[next].type.Equals("v")) {
            htsPhonemes[i].next_vowel_distance = 1;
        } else if (htsPhonemes[next].next_vowel_distance > 0) {
            htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1;
        } else {
            // No vowel anywhere after this consonant.
            htsPhonemes[i].next_vowel_distance = 0;
        }
    }
    return htsPhonemes;
}
$"{(long)Math.Round(startMs * 10000.0)} {(long)Math.Round(endMs * 10000.0)} {symbol}";
            }
        }

        /// <summary>
        /// Builds the HTS labels for one phrase and writes them to disk: the full-context score
        /// label goes to <c>fullScorePath</c> and the mono timing label to <c>monoTimingPath</c>.
        /// The phrase is wrapped in a leading and a trailing "pau" padding note.
        /// </summary>
        /// <param name="phrase">Phrase whose notes and phones are converted.</param>
        /// <remarks>
        /// Reads <c>headMs</c>, <c>tailMs</c>, <c>sigEnd</c> and <c>bpmEnd</c>, which <c>Layout</c> assigns.
        /// Any exception raised while writing the files is logged and rethrown.
        /// </remarks>
        public void ProcessPart(RenderPhrase phrase) {
            if (timeAxis == null) {
                timeAxis = phrase.timeAxis;
            }

            int startTick = phrase.position;
            int endTick = phrase.position + phrase.duration;

            // Whole sentence length (one leading bar + body + one trailing bar).
            double sentenceDurMs = headMs + phrase.endMs - phrase.positionMs + tailMs;
            int sentenceDurTicks = barLenTicksStart + (endTick - startTick) + barLenTicksEnd;

            // Leading padding "pau".
            timeAxis.TickPosToBarBeat(startTick - barLenTicksStart, out int barStart, out int beatStart, out int _);
            var sigForPadStart = timeAxis.TimeSignatureAtTick(startTick - barLenTicksStart);

            List<monoLabel> monoLabels_ = new List<monoLabel>();
            double phonemeDuration = 0;

            HTSNote PaddingNoteStart = new HTSNote(
                symbols: new string[] { "pau" },
                beatPerBar: sigForPadStart.beatPerBar,
                beatUnit: sigForPadStart.beatUnit,
                positionBar: barStart,
                positionBeat: beatStart,
                key: key,
                bpm: timeAxis.GetBpmAtTick(startTick - barLenTicksStart),
                tone: 0,
                isSlur: false,
                isRest: true,
                lang: string.Empty,
                accent: string.Empty,
                startms: 0,
                endms: headMs,
                positionTicks: startTick - barLenTicksStart,
                durationTicks: barLenTicksStart
            );
            var htsNotes = new List<HTSNote> { PaddingNoteStart };
            var htsPhonemes = new List<HTSPhoneme>();
            htsPhonemes.AddRange(HTSNoteToPhonemes(PaddingNoteStart));

            monoLabels_.Add(new monoLabel() {
                symbol = htsPhonemes[0].symbol,
                startMs = phonemeDuration,
                endMs = headMs
            });
            phonemeDuration += headMs;

            // Alignment: group the phrase's phones by the note they belong to.
            var phonemesByNoteIndex = phrase.phones
                .GroupBy(phone => phone.noteIndex)
                .ToDictionary(
                    group => group.Key,
                    group => group.ToArray());
            var lastBasePhonemes = Array.Empty<RenderPhone>();
            var tuples = new List<Tuple<HTSNote, int>>();
            for (int noteIndex = 0; noteIndex < phrase.notes.Length; noteIndex++) {
                var note = phrase.notes[noteIndex];
                if (phonemesByNoteIndex.TryGetValue(noteIndex, out var phonemes)) {
                    foreach (var phone in phonemes) {
                        monoLabels_.Add(new monoLabel() {
                            symbol = phone.phoneme,
                            startMs = phonemeDuration,
                            endMs = phonemeDuration + phone.durationMs
                        });
                        phonemeDuration += phone.durationMs;
                    }

                    lastBasePhonemes = phonemes;
                    HTSNote htsNote = makeHtsNote(phonemes.Select(phone => phone.phoneme).ToArray(), note, startTick, headMs);
                    tuples.Add(Tuple.Create(htsNote, noteIndex));
                } else if (IsSyllableVowelExtensionNote(note)) {
                    // An extension note stretches the last vowel of the preceding regular note.
                    var extensionPhoneme = FindLastVowelOrLastPhoneme(lastBasePhonemes);
                    if (!string.IsNullOrEmpty(extensionPhoneme.phoneme)) {
                        monoLabels_.Add(new monoLabel() {
                            symbol = extensionPhoneme.phoneme,
                            startMs = phonemeDuration,
                            endMs = phonemeDuration + note.durationMs
                        });
                        phonemeDuration += note.durationMs;

                        HTSNote htsNote = makeHtsNote(extensionPhoneme.phoneme, note, startTick, headMs);
                        tuples.Add(Tuple.Create(htsNote, noteIndex));
                    }
                }
                // Notes with neither phones nor an extension marker contribute nothing.
            }
            for (int i = 0; i < tuples.Count; i++) {
                var htsNote = tuples[i].Item1;
                htsNotes.Add(htsNote);
                // Positions are 1-based among the sung notes: HTSNote.e() writes values <= 0
                // as "xx" and derives percentages from (index - 1). The backward index must be
                // taken from the final count, not from the list that is still being filled.
                htsNote.index = i + 1;
                htsNote.indexBackwards = tuples.Count - i;
                htsNote.sentenceDurMs = sentenceDurMs;
                htsNote.sentenceDurTicks = sentenceDurTicks;
                var tmpPhonemes = HTSNoteToPhonemes(htsNote);
                var notePhonemes = CustomHTSPhonemeContext(tmpPhonemes, phrase.notes[tuples[i].Item2]) ?? tmpPhonemes;
                htsPhonemes.AddRange(notePhonemes);
            }
            // Trailing padding "pau" (positioned at the real end-of-phrase tick).
            timeAxis.TickPosToBarBeat(endTick, out int barEnd, out int beatEnd, out int _);
            HTSNote PaddingNoteEnd = new HTSNote(
                symbols: new string[] { "pau" },
                beatPerBar: sigEnd.beatPerBar,
                beatUnit: sigEnd.beatUnit,
                positionBar: barEnd,
                positionBeat: beatEnd,
                key: key,
                bpm: bpmEnd,
                tone: 0,
                isSlur: false,
                isRest: true,
                lang: string.Empty,
                accent: string.Empty,
                // Placed at the tail in absolute milliseconds.
                startms: sentenceDurMs - tailMs,
                endms: sentenceDurMs,
                positionTicks: endTick,
                durationTicks: barLenTicksEnd
            );
            htsNotes.Add(PaddingNoteEnd);
            htsPhonemes.AddRange(HTSNoteToPhonemes(PaddingNoteEnd));

            monoLabels_.Add(new monoLabel() {
                symbol = htsPhonemes[^1].symbol,
                startMs = phonemeDuration,
                endMs = sentenceDurMs
            });

            var htsPhrase = new HTSPhrase(htsNotes.ToArray());
            htsPhrase.UpdateResolution(resolution);
            htsPhrase.totalNotes = htsNotes.Count - 1;
            htsPhrase.totalPhonemes = htsPhonemes.Count - 1;
            htsPhrase.totalPhrases = 1;
            // Make neighborhood links between htsNotes and between htsPhonemes.
            foreach (int i in Enumerable.Range(0, htsNotes.Count)) {
                htsNotes[i].parent = htsPhrase;
                if (i > 0) {
                    htsNotes[i].prev = htsNotes[i - 1];
                    htsNotes[i - 1].next = htsNotes[i];
                }
            }
            for (int i = 1; i < htsPhonemes.Count; ++i) {
                htsPhonemes[i].prev = htsPhonemes[i - 1];
                htsPhonemes[i - 1].next = htsPhonemes[i];
            }

            try {
                File.WriteAllLines(fullScorePath, htsPhonemes.Select(x => x.dump()));
                File.WriteAllLines(monoTimingPath, monoLabels_.Select(x => x.ToString()));
            } catch (Exception e) {
                // Pass the exception object so its text is not parsed as a message template,
                // and rethrow with "throw;" to keep the original stack trace.
                Log.Error(e, "failed to write HTS label files");
                throw;
            }
        }

        public virtual RenderResult Layout(RenderPhrase phrase) {
            if (timeAxis == null) {
                timeAxis = phrase.timeAxis;
            }
            startTick = phrase.position;
            endTick = phrase.position + phrase.duration;

            // Set the padding to one bar length (both head and tail).
            sigStart = timeAxis.TimeSignatureAtTick(startTick);
            bpmStart =
timeAxis.GetBpmAtTick(startTick); + headMs = (int)Math.Round((60000.0 / bpmStart) * sigStart.beatPerBar); + + sigEnd = timeAxis.TimeSignatureAtTick(endTick); + bpmEnd = timeAxis.GetBpmAtTick(endTick); + tailMs = (int)Math.Round((60000.0 / bpmEnd) * sigEnd.beatPerBar); + return new RenderResult() { + leadingMs = headMs, + positionMs = phrase.positionMs, + estimatedLengthMs = headMs + phrase.durationMs + tailMs, + }; + } + + public abstract Task Render(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender); + + public abstract UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings); + + public abstract override string ToString(); + + public abstract RenderPitchResult LoadRenderedPitch(RenderPhrase phrase); + } +} diff --git a/OpenUtau.Core/Render/RenderPhrase.cs b/OpenUtau.Core/Render/RenderPhrase.cs index 048fd64f6..bf302843e 100644 --- a/OpenUtau.Core/Render/RenderPhrase.cs +++ b/OpenUtau.Core/Render/RenderPhrase.cs @@ -72,13 +72,13 @@ public class RenderPhone { public readonly bool direct; public readonly Vector2[] envelope; - // voicevox & enunu args + // voicevox & enunu & neutrino args public readonly int toneShift; public readonly UOto oto; public readonly ulong hash; - internal RenderPhone(UProject project, UTrack track, UVoicePart part, UNote note, UPhoneme phoneme, int phrasePosition) { + internal RenderPhone(UProject project, UTrack track, UVoicePart part, UNote note, UPhoneme phoneme, int phrasePosition, int noteIndex) { position = part.position + phoneme.position - phrasePosition; duration = phoneme.Duration; end = position + duration; @@ -90,6 +90,7 @@ internal RenderPhone(UProject project, UTrack track, UVoicePart part, UNote note this.phoneme = phoneme.phoneme; tone = note.tone; + this.noteIndex = noteIndex; tempos = project.timeAxis.TemposBetweenTicks(part.position + phoneme.position - leading, part.position + phoneme.End); UTempo[] noteTempos = 
project.timeAxis.TemposBetweenTicks(part.position + phoneme.position, part.position + phoneme.End); tempo = noteTempos.Length > 0 ? noteTempos[0].bpm : project.tempos[0].bpm; @@ -211,12 +212,10 @@ internal RenderPhrase(UProject project, UTrack track, UVoicePart part, IEnumerab uNotes.Add(next); next = next.Next; } - if (uNotes.First().Prev != null && uNotes.First().Prev.End == uNotes.First().position) { - uNotes.Insert(0, uNotes.First().Prev); - } - if (uNotes.Last().Next != null && uNotes.Last().End == uNotes.Last().Next.position) { - uNotes.Add(uNotes.Last().Next); - } + + var noteIndexes = uNotes + .Select((note, index) => new { note, index }) + .ToDictionary(x => x.note, x => x.index); singer = track.Singer; renderer = track.RendererSettings.Renderer; @@ -231,7 +230,7 @@ internal RenderPhrase(UProject project, UTrack track, UVoicePart part, IEnumerab .Select(n => new RenderNote(project, part, n, position)) .ToArray(); phones = phonemes - .Select(p => new RenderPhone(project, track, part, p.Parent, p, position)) + .Select(p => new RenderPhone(project, track, part, p.Parent, p, position, noteIndexes[p.Parent])) .ToArray(); leading = phones.First().leading; diff --git a/OpenUtau.Core/Util/HTS.cs b/OpenUtau.Core/Util/HTS.cs new file mode 100644 index 000000000..790f6bf65 --- /dev/null +++ b/OpenUtau.Core/Util/HTS.cs @@ -0,0 +1,770 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using OpenUtau.Core.Ustx; + +//This file implement utaupy.hts python library's function +//https://github.com/oatsu-gh/utaupy/hts.py + +//HTS labels use b instead of # +//In HTS labels, "xx" is a preserved keyword that means null +namespace OpenUtau.Core.Util { + public static class HTS { + public static readonly string[] KeysInOctave = { + "C", + "Db", + "D", + "Eb", + "E", + "F", + "Gb", + "G", + "Ab", + "A", + "Bb", + "B" , + }; + + public static readonly Dictionary NameInOctave = new Dictionary { + { "C", 0 }, { "C#", 1 }, { "Db", 1 }, + { "D", 2 }, { "D#", 3 }, 
{ "Eb", 3 },
            { "E", 4 },
            { "F", 5 }, { "F#", 6 }, { "Gb", 6 },
            { "G", 7 }, { "G#", 8 }, { "Ab", 8 },
            { "A", 9 }, { "A#", 10 }, { "Bb", 10 },
            { "B", 11 },
        };

        /// <summary>
        /// Formats a note number as a tone name using flats, e.g. 60 gives "C4".
        /// Returns an empty string for negative note numbers.
        /// </summary>
        public static string GetToneName(int noteNum) {
            return noteNum < 0 ? string.Empty : KeysInOctave[noteNum % 12] + (noteNum / 12 - 1).ToString();
        }

        /// <summary>
        /// Returns the position of the note inside its octave (0 for C up to 11 for B) as a string.
        /// Despite the name this is not the octave number.
        /// Returns an empty string for negative note numbers.
        /// </summary>
        public static string GetOctaveNum(int noteNum) {
            // Check the sign first: in C# a negative value modulo 12 is negative and
            // would index outside KeysInOctave.
            if (noteNum < 0) {
                return string.Empty;
            }
            NameInOctave.TryGetValue(KeysInOctave[noteNum % 12], out int num);
            return num.ToString();
        }

        /// <summary>
        /// Parses a tone name such as "C4", "F#3" or "Bb2" into a note number.
        /// </summary>
        /// <returns>The note number, or -1 when the name cannot be parsed.</returns>
        public static int NameToTone(string name) {
            if (string.IsNullOrEmpty(name) || name.Length < 2) {
                return -1;
            }
            // The key is one letter, optionally followed by '#' or 'b'; the rest is the octave.
            var str = name.Substring(0, (name[1] == '#' || name[1] == 'b') ? 2 : 1);
            var num = name.Substring(str.Length);
            if (!int.TryParse(num, out int octave)) {
                return -1;
            }
            if (!NameInOctave.TryGetValue(str, out int inOctave)) {
                return -1;
            }
            return 12 * (octave + 1) + inOctave;
        }

        /// <summary>
        /// Writes a signed integer in HTS form: "p" for zero or positive, "m" for negative,
        /// followed by the absolute value (e.g. -3 gives "m3").
        /// </summary>
        public static string WriteInt(int integer) {
            return (integer >= 0 ? "p" : "m") + Math.Abs(integer).ToString();
        }
    }

    public static class HTSContextBuilder {
        /// <summary>
        /// Returns true when <paramref name="isPauseLike"/> accepts the lower-cased form
        /// of at least one symbol.
        /// </summary>
        public static bool HasPauseLikePhoneme(IEnumerable<string> symbols, Func<string, bool> isPauseLike) {
            return symbols.Any(symbol => isPauseLike(symbol.ToLowerInvariant()));
        }

        /// <summary>
        /// Builds an <see cref="HTSNote"/> for the given symbols. The time signature, bar and
        /// beat are looked up at <paramref name="noteStartTick"/>, and the note is flagged as a
        /// rest when any symbol is pause-like.
        /// </summary>
        public static HTSNote BuildNote(
            string[] symbols,
            int tone,
            bool isSlur,
            string lang,
            int key,
            TimeAxis timeAxis,
            int noteStartTick,
            int noteEndTick,
            int phraseStartTick,
            int startMsOffset,
            Func<string, bool> isPauseLike) {
            UTimeSignature sig = timeAxis.TimeSignatureAtTick(noteStartTick);
            timeAxis.TickPosToBarBeat(noteStartTick, out int bar, out int beat, out int _);
            var isRest = HasPauseLikePhoneme(symbols, isPauseLike);
            return new HTSNote(
                symbols: symbols,
                tone: tone,
                isSlur: isSlur,
                isRest: isRest,
                lang: isRest ?
string.Empty : lang,
                accent: string.Empty,
                beatPerBar: sig.beatPerBar,
                beatUnit: sig.beatUnit,
                positionBar: bar,
                positionBeat: beat,
                key: key,
                bpm: timeAxis.GetBpmAtTick(noteStartTick),
                startms: (int)timeAxis.MsBetweenTickPos(phraseStartTick, noteStartTick) + startMsOffset,
                endms: (int)timeAxis.MsBetweenTickPos(phraseStartTick, noteEndTick) + startMsOffset,
                positionTicks: noteStartTick,
                durationTicks: noteEndTick - noteStartTick);
        }

        /// <summary>
        /// Returns the index of the first symbol accepted by <paramref name="isVowel"/>,
        /// or 0 when no symbol is accepted.
        /// </summary>
        public static int FindFirstVowelIndex(IReadOnlyList<string> symbols, Func<string, bool> isVowel) {
            int index = 0;
            while (index < symbols.Count) {
                if (isVowel(symbols[index])) {
                    return index;
                }
                index++;
            }
            return 0;
        }

        /// <summary>
        /// Converts phoneme durations into start positions. The phonemes before the first
        /// alignment point keep their duration and end at that point; every later group of
        /// phonemes between two alignment points is scaled to fill the time between them.
        /// </summary>
        /// <param name="durationsMs">Phoneme durations.</param>
        /// <param name="phAlignPoints">Pairs of (phoneme index, position) used as anchors.</param>
        /// <returns>The start positions; empty when either input is empty.</returns>
        public static List<double> AlignTimingPositions(
            IReadOnlyList<double> durationsMs,
            IReadOnlyList<Tuple<int, double>> phAlignPoints) {
            var aligned = new List<double>();
            if (durationsMs.Count == 0 || phAlignPoints.Count == 0) {
                return aligned;
            }

            // Leading group: unscaled, ending at the first alignment point.
            var firstPoint = phAlignPoints[0];
            int leadingCount = Math.Max(0, firstPoint.Item1 - 1);
            var leadingGroup = durationsMs.Take(leadingCount).ToList();
            aligned.AddRange(Stretch(leadingGroup, 1, firstPoint.Item2));

            // Every pair of neighbouring alignment points delimits one group.
            for (int k = 1; k < phAlignPoints.Count; k++) {
                var left = phAlignPoints[k - 1];
                var right = phAlignPoints[k];
                int groupSize = right.Item1 - left.Item1;
                if (groupSize <= 0) {
                    continue;
                }
                var segment = durationsMs.Skip(left.Item1).Take(groupSize).ToList();
                if (segment.Count == 0) {
                    continue;
                }
                var total = segment.Sum();
                var scale = total == 0 ? 0 : (right.Item2 - left.Item2) / total;
                aligned.AddRange(Stretch(segment, scale, right.Item2));
            }
            return aligned;
        }

        /// <summary>
        /// Pairs each non-empty phoneme in [startIndex, endIndex) with a tick value computed
        /// by <paramref name="ticksBetweenMsPos"/> from <paramref name="notePosMs"/> and the
        /// position stored at (phoneme index - 1). Indexes outside either list are skipped.
        /// </summary>
        public static List<Tuple<string, int>> BuildAlignedNoteTimingResult(
            IReadOnlyList<string> phonemes,
            int startIndex,
            int endIndex,
            IReadOnlyList<double> positionsMs,
            double notePosMs,
            Func<double, double, int> ticksBetweenMsPos) {
            var timings = new List<Tuple<string, int>>();
            for (int phIndex = startIndex; phIndex < endIndex; ++phIndex) {
                bool insidePhonemes = phIndex >= 0 && phIndex < phonemes.Count;
                if (!insidePhonemes || string.IsNullOrEmpty(phonemes[phIndex])) {
                    continue;
                }
                int positionIndex = phIndex - 1;
                bool insidePositions = positionIndex >= 0 && positionIndex < positionsMs.Count;
                if (!insidePositions) {
                    continue;
                }
                int ticks = ticksBetweenMsPos(notePosMs, positionsMs[positionIndex]);
                timings.Add(Tuple.Create(phonemes[phIndex], ticks));
            }
            return timings;
        }

        /// <summary>
        /// Scales the durations by <paramref name="ratio"/> and returns the start position of
        /// each one, laid out so that the last duration ends at <paramref name="endPos"/>.
        /// </summary>
        public static List<double> Stretch(IList<double> source, double ratio, double endPos) {
            double startPos = endPos - source.Sum() * ratio;
            var scaledWithLeadingZero = source.Select(x => x * ratio).Prepend(0);
            var positions = CumulativeSum(scaledWithLeadingZero, startPos).ToList();
            // The final running total is the end position, not a start position.
            positions.RemoveAt(positions.Count - 1);
            return positions;
        }

        /// <summary>
        /// Lazily yields the running total of <paramref name="sequence"/>, beginning
        /// from <paramref name="start"/>.
        /// </summary>
        public static IEnumerable<double> CumulativeSum(IEnumerable<double> sequence, double start = 0) {
            double running = start;
            foreach (var value in sequence) {
                running += value;
                yield return running;
            }
        }
    }

    public class HTSPhoneme {
        public string symbol;
        public string flag1 = "xx";
        public string flag2 = "xx";

        //Links to this phoneme's neighbors and parent
        public HTSPhoneme? prev;
        public HTSPhoneme?
next;
        public HTSNote parent;

        // Information about this phoneme.
        // Type letter: v = vowel, c = consonant, p = pause, s = silence, b = break.
        public string type = "xx";
        // 1-based position counted from the start of the note.
        public int position = 1;
        // 1-based position counted from the end of the note.
        public int position_backward = 1;
        // Distances to the neighbouring vowels inside the note.
        // 0 is the unset value; p() writes it as "xx".
        public int prev_vowel_distance = 0;
        public int next_vowel_distance = 0;

        public HTSPhoneme(string phoneme, HTSNote note) {
            symbol = phoneme;
            parent = note;
        }

        // The phoneme two steps back, or null when the chain is shorter.
        public HTSPhoneme? beforePrev {
            get {
                return prev == null ? null : prev.prev;
            }
        }

        // The phoneme two steps ahead, or null when the chain is shorter.
        public HTSPhoneme? afterNext {
            get {
                return next == null ? null : next.next;
            }
        }

        // Writes this phoneme as one HTS full-context label line: the parent note's start and
        // end time followed by the phoneme section and the /A ... /J context sections.
        public string dump() {
            // Times are written in 100 ns units; long avoids an int overflow.
            string timing = $"{(long)Math.Round(parent.startMs * 10000.0)} {(long)Math.Round(parent.endMs * 10000.0)} ";
            return string.Concat(
                timing,
                // Phoneme section
                string.Format("{0}@{1}^{2}-{3}+{4}={5}_{6}%{7}^{8}_{9}~{10}-{11}!{12}[{13}${14}]{15}", p()),
                // Syllable sections
                string.Format("/A:{0}-{1}-{2}@{3}~{4}", a()),
                string.Format("/B:{0}_{1}_{2}@{3}|{4}", b()),
                string.Format("/C:{0}+{1}+{2}@{3}&{4}", c()),
                // Note sections
                string.Format("/D:{0}!{1}#{2}${3}%{4}|{5}&{6};{7}-{8}", d()),
                string.Format(
                    "/E:{0}]{1}^{2}={3}~{4}!{5}@{6}#{7}+{8}]{9}${10}|{11}[{12}&{13}]{14}={15}^{16}~{17}#{18}_{19};{20}${21}&{22}%{23}[{24}|{25}]{26}-{27}^{28}+{29}~{30}={31}@{32}${33}!{34}%{35}#{36}|{37}|{38}-{39}&{40}&{41}+{42}[{43};{44}]{45};{46}~{47}~{48}^{49}^{50}@{51}[{52}#{53}={54}!{55}~{56}+{57}!{58}^{59}",
                    e()),
                string.Format("/F:{0}#{1}#{2}-{3}${4}${5}+{6}%{7};{8}", f()),
                // Phrase and song sections
                string.Format("/G:{0}_{1}", g()),
                string.Format("/H:{0}_{1}", h()),
                string.Format("/I:{0}_{1}", i()),
                string.Format("/J:{0}~{1}@{2}", j()));
        }

public string[] p() {
            // Phoneme-level context: 16 slots, "xx" wherever a neighbour or value is missing.
            var twoBack = beforePrev;
            var twoAhead = afterNext;
            return new string[] {
                type,
                twoBack == null ? "xx" : twoBack.symbol,
                prev == null ? "xx" : prev.symbol,
                symbol,
                next == null ? "xx" : next.symbol,
                twoAhead == null ? "xx" : twoAhead.symbol,
                twoBack == null ? "xx" : twoBack.flag1,
                prev == null ? "xx" : prev.flag1,
                flag1,
                next == null ? "xx" : next.flag1,
                twoAhead == null ? "xx" : twoAhead.flag1,
                position.ToString(),
                position_backward.ToString(),
                // A distance of 0 means "not set" and is written as "xx".
                prev_vowel_distance == 0 ? "xx" : prev_vowel_distance.ToString(),
                next_vowel_distance == 0 ? "xx" : next_vowel_distance.ToString(),
                flag2,
            };
        }

        // The remaining context sections are owned by the parent note; forward to it.
        public string[] a() => parent.a();

        public string[] b() => parent.b();

        public string[] c() => parent.c();

        public string[] d() => parent.d();

        public string[] e() => parent.e();

        public string[] f() => parent.f();

        public string[] g() => parent.g();

        public string[] h() => parent.h();

        public string[] i() => parent.i();

        public string[] j() => parent.j();
    }

    // TODO: Keep HTS note-context generation centralized here.
    // Remaining E-context slots that stay "xx" today should only be filled after
    // their HTS/NEUTRINO semantics are confirmed against the target implementation.
+ public class HTSNote { + public double startMs = 0; + public double endMs = 0; + public int positionTicks; + public int durationTicks = 0; + public int index = 0;//index of this note in sentence + public int indexBackwards = 0; + public double sentenceDurMs = 0; + public int sentenceDurTicks = 0; + public double startMsPercent = 0; + + //TimeSignatures + public int beatPerBar = 0; + public int beatUnit = 0; + + public int positionBar = 1; //bar number in the sentence, starting from 1 + public int positionBeat = 1; //unit number in the bar, starting from 1 + + public double key = 0; + public double bpm = 0; + public int tone = 0; + public bool isSlur = false; + public bool isRest = true; + public string[] symbols; + public string lang = string.Empty; + public string langDependent = "xx"; + public string accent = string.Empty; + + public HTSNote? prev; + public HTSNote? next; + public HTSPhrase parent; + + public HTSNote(string[] symbols, int beatPerBar, int beatUnit, int positionBar, int positionBeat, int key, double bpm, int tone, bool isSlur, bool isRest, string lang, string accent, double startms, double endms, int positionTicks, int durationTicks) { + this.startMs = startms; + this.endMs = endms; + this.beatPerBar = beatPerBar; + this.beatUnit = beatUnit; + this.positionBar = positionBar; + this.positionBeat = positionBeat; + this.key = key; + this.bpm = bpm; + this.tone = tone; + this.isSlur = isSlur; + this.isRest = isRest; + this.lang = lang; + this.accent = accent; + this.symbols = symbols; + this.positionTicks = positionTicks; + this.durationTicks = durationTicks; + } + + public double durationMs { + get { return endMs - startMs; } + } + + private double startMsBackwards { + get { return sentenceDurMs - startMs; } + } + + private int positionTickBackwards { + get { return sentenceDurTicks - positionTicks; } + } + + + public int? measureIndexForward; + public double? measureMsForward; + public int? measureTickForward; + public int? 
measurePercentForward; + public int? measureIndexBackward; + public double? measureMsBackward; + public int? measureTickBackward; + public int? measurePercentBackward; + + public int? accentIndexForward; + public double? accentMsForward; + public int? accentTickForward; + public int? accentIndexBackward; + public double? accentMsBackward; + public int? accentTickBackward; + + public string[] a() { + if (prev == null) { + return Enumerable.Repeat("xx", 5).ToArray(); + } else if (prev.isRest) { + return Enumerable.Repeat("xx", 5).ToArray(); + } else { + return prev.b(); + } + } + + public string[] b() { + return new string[] { + symbols.Length.ToString(), + "1", + "1", + lang != string.Empty ? lang : "xx", + langDependent, + }; + } + + public string[] c() { + if (next == null) { + return Enumerable.Repeat("xx", 5).ToArray(); + } else if (next.isRest) { + return Enumerable.Repeat("xx", 5).ToArray(); + } else { + return next.b(); + } + } + + public string[] d() { + if (prev == null) { + return Enumerable.Repeat("xx", 60).ToArray(); + } else if (prev.isRest) { + return Enumerable.Repeat("xx", 60).ToArray(); + } else { + return prev.e(); + } + } + + public string[] e() { + var result = Enumerable.Repeat("xx", 60).ToArray(); + result[0] = isRest ? "xx" : HTS.GetToneName(tone); + result[1] = isRest ? "xx" : HTS.GetOctaveNum(tone); + result[2] = ((int)Math.Round(key)).ToString(); + result[3] = $"{beatPerBar}/{beatUnit}"; + result[4] = ((int)Math.Round(bpm)).ToString(); + result[5] = "1"; + + int lengthCs = Math.Max(0, (int)Math.Round(durationMs / 10.0)); + int ticksPer96th = (parent != null && parent.resolution > 0) ? parent.resolution / 24 : 0; + int length96 = (ticksPer96th > 0) ? (int)Math.Round((double)durationTicks / ticksPer96th) : 0; + result[6] = lengthCs.ToString(); + result[7] = length96.ToString(); + + result[9] = measureIndexForward != null ? measureIndexForward.ToString() : "xx"; // e10 + result[10] = measureIndexBackward != null ? 
measureIndexBackward.ToString() : "xx"; // e11 + result[11] = measureMsForward != null ? ((int)Math.Round(measureMsForward.Value)).ToString() : "xx"; // e12 (centisecond already) + result[12] = measureMsBackward != null ? ((int)Math.Round(measureMsBackward.Value)).ToString() : "xx"; // e13 + result[13] = measureTickForward != null ? measureTickForward.ToString() : "xx"; // e14 (96th already) + result[14] = measureTickBackward != null ? measureTickBackward.ToString() : "xx"; // e15 + result[15] = measurePercentForward != null ? measurePercentForward.ToString() : "xx"; // e16 + result[16] = measurePercentBackward != null ? measurePercentBackward.ToString() : "xx"; // e17 + + if (!isRest) { + result[17] = index <= 0 ? "xx" : index.ToString(); + result[18] = indexBackwards <= 0 ? "xx" : indexBackwards.ToString(); + result[19] = ((int)Math.Round(startMs / 10)).ToString(); // 10ms単位 + result[20] = ((int)Math.Round(startMsBackwards / 10)).ToString(); + + // e22/e23: phrase-level position by 96th note, resolution independent + if (ticksPer96th > 0 && parent != null && parent.notes != null && index > 0) { + int firstPhraseTick = parent.notes + .Select(note => note.positionTicks) + .DefaultIfEmpty(positionTicks) + .Min(); + int lastPhraseTick = parent.notes + .Select(note => note.positionTicks) + .DefaultIfEmpty(positionTicks) + .Max(); + int forwardTicks = Math.Max(0, positionTicks - firstPhraseTick); + int backwardTicks = Math.Max(0, lastPhraseTick - positionTicks); + result[21] = ((forwardTicks + ticksPer96th / 2) / ticksPer96th).ToString(); + result[22] = ((backwardTicks + ticksPer96th / 2) / ticksPer96th).ToString(); + } else { + result[21] = "xx"; + result[22] = "xx"; + } + + int totalNotes = parent?.totalNotes ?? 
0; + if (totalNotes > 1) { + result[23] = ((index - 1) * 100 / (totalNotes - 1)).ToString(); + result[24] = ((indexBackwards - 1) * 100 / (totalNotes - 1)).ToString(); + } else { + result[23] = "xx"; + result[24] = "xx"; + } + + } + + if (prev != null) { + result[25] = prev.isSlur && isSlur ? "1" : "0"; + } else { + result[25] = "0"; + } + if (next != null) { + result[26] = next.isSlur && isSlur ? "1" : "0"; + } else { + result[26] = "0"; + } + result[27] = "n"; + result[28] = accentIndexBackward.HasValue ? accentIndexBackward.Value.ToString() : "xx"; + result[29] = accentIndexForward.HasValue ? accentIndexForward.Value.ToString() : "xx"; + result[30] = accentMsBackward.HasValue ? ((int)Math.Round(accentMsBackward.Value / 10.0)).ToString() : "xx"; + result[31] = accentMsForward.HasValue ? ((int)Math.Round(accentMsForward.Value / 10.0)).ToString() : "xx"; + result[32] = (accentTickBackward.HasValue && ticksPer96th > 0) ? ((int)Math.Round((double)accentTickBackward.Value / ticksPer96th)).ToString() : "xx"; + result[33] = (accentTickForward.HasValue && ticksPer96th > 0) ? ((int)Math.Round((double)accentTickForward.Value / ticksPer96th)).ToString() : "xx"; + + // TODO: e34-e56 remain intentionally "xx" until OpenUtau adopts a + // verified mapping for staccato / crescendo / decrescendo related + // score-label contexts. Keep current behavior visible instead of + // guessing values from timing-label-only information. + + if (!isRest && this.tone > 0) { + result[56] = (prev == null || prev.isRest || prev.tone <= 0) ? "xx" : HTS.WriteInt(prev.tone - tone); + result[57] = (next == null || next.isRest || next.tone <= 0) ? 
"xx" : HTS.WriteInt(next.tone - tone); + } else { + result[56] = "xx"; + result[57] = "xx"; + } + return result; + } + + public string[] f() { + if (next == null) { + return Enumerable.Repeat("xx", 60).ToArray(); + } else if (next.isRest) { + return Enumerable.Repeat("xx", 60).ToArray(); + } else { + return next.e(); + } + } + + public string[] g() { + //TODO Calculate using HTSPhrase + if (prev != null) { + if (isRest) { + return prev.h(); + } + } + return parent.g(); + } + + public string[] h() { + // TODO Calculate using HTSPhrase + if (isRest) { + return Enumerable.Repeat("xx", 2).ToArray(); + } + return parent.h(); + } + + public string[] i() { + //TODO Calculate using HTSPhrase + if (next != null) { + if (isRest) { + return next.h(); + } + } + return parent.i(); + } + + public string[] j() { + return parent.j(); + } + } + + public class HTSPhrase { + public int resolution = 480; + public int totalPhrases; + public int totalNotes; + public int totalPhonemes; + + public HTSPhrase? prev; + public HTSPhrase? 
next; + public HTSNote[] notes; + + public HTSPhrase(HTSNote[] notes) { + this.notes = notes; + RecalculateDerivedContexts(); + } + + public void UpdateResolution(int resolution) { + this.resolution = resolution; + RecalculateDerivedContexts(); + } + + void RecalculateDerivedContexts() { + foreach (var note in notes) { + note.accentIndexForward = null; + note.accentMsForward = null; + note.accentTickForward = null; + note.accentIndexBackward = null; + note.accentMsBackward = null; + note.accentTickBackward = null; + note.measureIndexForward = null; + note.measureMsForward = null; + note.measureTickForward = null; + note.measurePercentForward = null; + note.measureIndexBackward = null; + note.measureMsBackward = null; + note.measureTickBackward = null; + note.measurePercentBackward = null; + } + + // アクセント(forward) + int accentIndexForwardSum = 0; + double accentMsForwardSum = 0; + int accentTickForwardSum = 0; + for (int i = 0; i < notes.Length; i++) { + var note = notes[i]; + if (note.isRest) { + accentIndexForwardSum = 0; + accentMsForwardSum = 0; + accentTickForwardSum = 0; + } else if (!string.IsNullOrEmpty(note.accent)) { + note.accentIndexForward = 0; + note.accentMsForward = 0; + note.accentTickForward = 0; + + accentIndexForwardSum = 1; + accentMsForwardSum = note.durationMs; + accentTickForwardSum = note.durationTicks; + } else { + if (accentIndexForwardSum != 0) { + note.accentIndexForward = accentIndexForwardSum; + accentIndexForwardSum += 1; + } + if (accentMsForwardSum != 0) { + note.accentMsForward = accentMsForwardSum; + accentMsForwardSum += note.durationMs; + } + if (accentTickForwardSum != 0) { + note.accentTickForward = accentTickForwardSum; + accentTickForwardSum += note.durationTicks; + } + } + } + + // アクセント(backward) + int accentIndexBackwardSum = 0; + double accentMsBackwardSum = 0; + int accentTickBackwardSum = 0; + int lastAccentIndexContribution = 0; + double lastAccentMs = 0; + int lastAccentTicks = 0; + for (int i = notes.Length - 1; i 
>= 0; i--) { + var note = notes[i]; + if (note.isRest) { + accentIndexBackwardSum = 0; + accentMsBackwardSum = 0; + accentTickBackwardSum = 0; + lastAccentIndexContribution = 0; + lastAccentMs = 0; + lastAccentTicks = 0; + } else if (!string.IsNullOrEmpty(note.accent)) { + note.accentIndexBackward = Math.Max(0, accentIndexBackwardSum - lastAccentIndexContribution); + note.accentMsBackward = Math.Max(0, accentMsBackwardSum - lastAccentMs); + note.accentTickBackward = Math.Max(0, accentTickBackwardSum - lastAccentTicks); + + lastAccentIndexContribution = 1; + lastAccentMs = note.durationMs; + lastAccentTicks = note.durationTicks; + + accentIndexBackwardSum = 1; + accentMsBackwardSum = note.durationMs; + accentTickBackwardSum = note.durationTicks; + } else { + if (accentIndexBackwardSum != 0) { + note.accentIndexBackward = accentIndexBackwardSum; + accentIndexBackwardSum += 1; + } + if (accentMsBackwardSum != 0) { + note.accentMsBackward = accentMsBackwardSum; + accentMsBackwardSum += note.durationMs; + } + if (accentTickBackwardSum != 0) { + note.accentTickBackward = accentTickBackwardSum; + accentTickBackwardSum += note.durationTicks; + } + + } + } + + // 小節ごとのグルーピング(positionBar 基準) + var groups = notes + .GroupBy(n => n.positionBar) + .OrderBy(g => g.Key) + .Select(g => g.OrderBy(n => n.positionTicks).ToList()) + .ToList(); + + int ticksPer96th = (resolution > 0) ? (resolution / 24) : 0; + + foreach (var group in groups) { + double totalDurationMs = group.Sum(n => n.durationMs); + int totalDurationTicks = group.Sum(n => n.durationTicks); + int totalNotesInMeasure = group.Count; + // forward(小節先頭からの位置) + double accMsF = 0; + int accTicksF = 0; + for (var noteIndex = 0; noteIndex < group.Count; noteIndex++) { + var note = group[noteIndex]; + note.measureIndexForward = noteIndex + 1; + note.measureMsForward = (int)Math.Round(accMsF / 100.0); + note.measureTickForward = ticksPer96th > 0 ? 
(int)Math.Round((double)accTicksF / ticksPer96th) : 0; + note.measurePercentForward = totalNotesInMeasure > 1 ? (noteIndex * 100) / (totalNotesInMeasure - 1) : 0; + + accMsF += note.durationMs; + accTicksF += note.durationTicks; + } + + // backward + double accMsB = 0; + int accTicksB = 0; + for (int noteIndex = group.Count - 1; noteIndex >= 0; --noteIndex) { + var note = group[noteIndex]; + int backwardIndex = group.Count - noteIndex; + note.measureIndexBackward = backwardIndex; + note.measureMsBackward = (int)Math.Round(accMsB / 100.0); + note.measureTickBackward = ticksPer96th > 0 ? (int)Math.Round((double)accTicksB / ticksPer96th) : 0; + note.measurePercentBackward = totalNotesInMeasure > 1 ? ((backwardIndex - 1) * 100) / (totalNotesInMeasure - 1) : 0; + + accMsB += note.durationMs; + accTicksB += note.durationTicks; + } + } + } + private int barCount { + get { return notes[^1].positionBar - notes[0].positionBar + 1; } + } + + public string[] g() { + var result = Enumerable.Repeat("xx", 2).ToArray(); + if (prev == null) { + return result; + } else { + return prev.h(); + } + } + + public string[] h() { + var result = Enumerable.Repeat("xx", 2).ToArray(); + result[0] = notes.Length.ToString(); + result[1] = notes.Select(note => note.symbols.Length).Sum().ToString(); + return result; + } + + public string[] i() { + var result = Enumerable.Repeat("xx", 2).ToArray(); + if (next == null) { + return result; + } else { + return next.h(); + } + } + + public string[] j() { + var result = Enumerable.Repeat("xx", 3).ToArray(); + result[0] = (barCount > 0 ? (totalNotes / barCount).ToString() : "xx"); + result[1] = (barCount > 0 ? 
(totalPhonemes / barCount).ToString() : "xx"); + result[2] = totalPhrases.ToString(); + return result; + } + } +} diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTSLabelFile.cs b/OpenUtau.Core/Util/HTSLabelFile.cs similarity index 99% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/HTSLabelFile.cs rename to OpenUtau.Core/Util/HTSLabelFile.cs index 87fd0028a..944c4f844 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTSLabelFile.cs +++ b/OpenUtau.Core/Util/HTSLabelFile.cs @@ -5,10 +5,10 @@ using System.Collections; using System.IO; using System.Text.RegularExpressions; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.python; +using OpenUtau.Core.Util.nnmnkwii.python; //reference: https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/io/hts.py -namespace OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.io.hts { +namespace OpenUtau.Core.Util.nnmnkwii.io.hts { public class HTSLabel { public int start_time = 0; public int end_time = 0; diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/Merlin.cs b/OpenUtau.Core/Util/Merlin.cs similarity index 98% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/Merlin.cs rename to OpenUtau.Core/Util/Merlin.cs index cfb874872..59da9dc0d 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/Merlin.cs +++ b/OpenUtau.Core/Util/Merlin.cs @@ -2,10 +2,11 @@ using System.Collections.Generic; using System.Text.RegularExpressions; using System.Linq; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.io.hts; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.io.hts; //reference: https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/frontend/merlin.py -namespace OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.frontend { +namespace OpenUtau.Core.Util.nnmnkwii.frontend { public class merlin { //TODO:Should subphone_features be an enum? 
static Dictionary frame_feature_size_dict = new Dictionary diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs b/OpenUtau.Core/Util/Python.cs similarity index 94% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs rename to OpenUtau.Core/Util/Python.cs index 9a27970d1..a8306a96d 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs +++ b/OpenUtau.Core/Util/Python.cs @@ -1,7 +1,7 @@ using System; using System.Text.RegularExpressions; -namespace OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.python { +namespace OpenUtau.Core.Util.nnmnkwii.python { public class AssertionError : Exception { public AssertionError() : base() { } diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs b/OpenUtau.Core/Util/Scaler.cs similarity index 97% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs rename to OpenUtau.Core/Util/Scaler.cs index 6201fcfe5..39a5b303f 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs +++ b/OpenUtau.Core/Util/Scaler.cs @@ -4,7 +4,7 @@ using System.Text; using Newtonsoft.Json; -namespace OpenUtau.Plugin.Builtin.EnunuOnnx { +namespace OpenUtau.Core.Util { public class ScalerLine { public float xmin; public float scale; diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs index 8c3b74a07..d9c664bcf 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs +++ b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs @@ -9,9 +9,10 @@ using OpenUtau.Api; using OpenUtau.Core; using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; using OpenUtau.Plugin.Builtin.EnunuOnnx; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.frontend; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.io.hts; using Serilog; //This phonemizer is a pure C# implemention of the ENUNU phonemizer, @@ -280,9 +281,21 @@ string[] GetSymbols(Note note) { //make a HTS Note from given symbols and UNotes 
protected HTSNote makeHtsNote(string[] symbols, IList group, int startTick) { + UTimeSignature sig = timeAxis.TimeSignatureAtTick(group[0].position); + timeAxis.TickPosToBarBeat(group[0].position, out int bar, out int beat, out int remainingTicks); return new HTSNote( symbols: symbols, tone: group[0].tone, + isSlur: IsSyllableVowelExtensionNote(group[0]), + isRest: symbols.Select(x => x.ToLowerInvariant()).Any(x => pauses.Contains(x) || silences.Contains(x) || breaks.Contains(x)), + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: 0, + lang: string.Empty, + accent: string.Empty, + bpm: timeAxis.GetBpmAtTick(group[0].position), startms: (int)timeAxis.MsBetweenTickPos(startTick, group[0].position) + paddingMs, endms: (int)timeAxis.MsBetweenTickPos(startTick, group[^1].position + group[^1].duration) + paddingMs, positionTicks: group[0].position, @@ -439,26 +452,32 @@ protected virtual HTSNote[] MakeSyllables(Note[] inputNotes, int startTick) { HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) { var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray(); - int prevVowelPos = -1; foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); htsPhonemes[i].position = i + 1; htsPhonemes[i].position_backward = htsPhonemes.Length - i; - htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); - if (htsPhonemes[i].type == "v") { - prevVowelPos = i; - } else { - if (prevVowelPos > 0) { - htsPhonemes[i].distance_from_previous_vowel = i - prevVowelPos; + } + foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + if (htsPhonemes[i].type.Equals("c")) { + int next = i + 1; + if (next < htsPhonemes.Length) { + if (htsPhonemes[next].type.Equals("v")) { + htsPhonemes[i].next_vowel_distance = 1; + } else { + htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; + } } } } - int nextVowelPos = -1; for (int i = 
htsPhonemes.Length - 1; i > 0; --i) { - if (htsPhonemes[i].type == "v") { - nextVowelPos = i; - } else { - if (nextVowelPos > 0) { - htsPhonemes[i].distance_to_next_vowel = nextVowelPos - i; + if (htsPhonemes[i].type.Equals("c")) { + int prev = i - 1; + if (prev >= 0) { + if (htsPhonemes[prev].type.Equals("v")) { + htsPhonemes[i].prev_vowel_distance = 1; + } else { + htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1; + } } } } @@ -473,9 +492,21 @@ void ProcessPart(Note[][] phrase) { int paddingTicks = timeAxis.MsPosToTickPos(paddingMs); var notePhIndex = new List { 1 };//每个音符的第一个音素在音素列表上对应的位置 var phAlignPoints = new List>();//音素对齐的位置,Ms,绝对时间 + UTimeSignature sig = timeAxis.TimeSignatureAtTick(phrase[0][0].position - paddingTicks); + timeAxis.TickPosToBarBeat(phrase[0][0].position - paddingTicks, out int bar, out int beat, out int remainingTicks); HTSNote PaddingNote = new HTSNote( - symbols: new string[] { "sil" }, + symbols: new string[] { defaultPause }, + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: 0, + bpm: 0, tone: 0, + isSlur: false, + isRest: true, + lang: string.Empty, + accent: string.Empty, startms: 0, endms: paddingMs, positionTicks: phrase[0][0].position - paddingTicks, @@ -515,11 +546,12 @@ void ProcessPart(Note[][] phrase) { htsPhonemes.Count, timeAxis.TickPosToMsPos(lastNote.positionTicks + lastNote.durationTicks))); + var htsPhrase = new HTSPhrase(htsNotes.ToArray()); + htsPhrase.totalNotes = htsNotes.Count; + htsPhrase.totalPhonemes = htsPhonemes.Count; //make neighborhood links between htsNotes and between htsPhonemes foreach (int i in Enumerable.Range(0, htsNotes.Count)) { - htsNotes[i].index = i; - htsNotes[i].indexBackwards = htsNotes.Count - i; - htsNotes[i].sentenceDurMs = sentenceDurMs; + htsNotes[i].parent = htsPhrase; if (i > 0) { htsNotes[i].prev = htsNotes[i - 1]; htsNotes[i - 1].next = htsNotes[i]; diff --git 
a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs b/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs deleted file mode 100644 index ffe0750b6..000000000 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs +++ /dev/null @@ -1,256 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -//This file implement utaupy.hts python library's function -//https://github.com/oatsu-gh/utaupy/blob/master/utaupy/hts.py - -//HTS labels use b instead of # -//In HTS labels, "xx" is a preserved keyword that means null -namespace OpenUtau.Plugin.Builtin.EnunuOnnx { - public static class HTS { - public static readonly string[] KeysInOctave = { - "C", - "Db", - "D", - "Eb", - "E", - "F", - "Gb", - "G", - "Ab", - "A", - "Bb", - "B" , - }; - - public static readonly Dictionary NameInOctave = new Dictionary { - { "C", 0 }, { "C#", 1 }, { "Db", 1 }, - { "D", 2 }, { "D#", 3 }, { "Eb", 3 }, - { "E", 4 }, - { "F", 5 }, { "F#", 6 }, { "Gb", 6 }, - { "G", 7 }, { "G#", 8 }, { "Ab", 8 }, - { "A", 9 }, { "A#", 10 }, { "Bb", 10 }, - { "B", 11 }, - }; - - public static string GetToneName(int noteNum) { - return noteNum < 0 ? string.Empty : KeysInOctave[noteNum % 12] + (noteNum / 12 - 1).ToString(); - } - - //return -1 if error - public static int NameToTone(string name) { - if (name.Length < 2) { - return -1; - } - var str = name.Substring(0, (name[1] == '#' || name[1] == 'b') ? 2 : 1); - var num = name.Substring(str.Length); - if (!int.TryParse(num, out int octave)) { - return -1; - } - if (!NameInOctave.TryGetValue(str, out int inOctave)) { - return -1; - } - return 12 * (octave + 1) + inOctave; - } - - //write integer with "p" as positive and "n" as negative. 0 is "p0" - public static string WriteInt(int integer) { - return (integer >= 0 ? "p":"m" )+Math.Abs(integer).ToString(); - } - } - - public class HTSPhoneme{ - public string symbol; - - //Links to this phoneme's neighbors and parent - public HTSPhoneme? prev; - public HTSPhoneme? 
next; - public HTSNote parent; - - //informations about this phoneme - //v:vowel, c:consonant, p:pause, s:silence, b:break - public string type = "xx"; - //(number of phonemes before this phoneme in this note) + 1 - public int position = 1; - //(number of phonemes after this phoneme in this note) + 1 - public int position_backward = 1; - //Here -1 means null - //distances to vowels in this note, -1 for vowels themselves - public int distance_from_previous_vowel = -1; - public int distance_to_next_vowel = -1; - - public HTSPhoneme(string phoneme, HTSNote note) { - this.symbol = phoneme; - this.parent = note; - } - - public HTSPhoneme? beforePrev { - get { - if (prev == null) { return null; } else { return prev.prev;} - } - } - - public HTSPhoneme? afterNext { - get { - if (next == null) { return null; } else { return next.next; } - } - } - - public string dump() { - //Write phoneme as an HTS line - - string result = - $"{parent.startMs * 100000} {parent.endMs * 100000} " - //Phoneme informations - + string.Format("{0}@{1}^{2}-{3}+{4}={5}_{6}%{7}^{8}_{9}~{10}-{11}!{12}[{13}${14}]{15}", p()) - //Syllable informations - + string.Format("/A:{0}-{1}-{2}@{3}~{4}", a()) - + string.Format("/B:{0}_{1}_{2}@{3}|{4}", b()) - + string.Format("/C:{0}+{1}+{2}@{3}&{4}", c()) - //Note informations - + string.Format("/D:{0}!{1}#{2}${3}%{4}|{5}&{6};{7}-{8}", d()) - + string.Format( - "/E:{0}]{1}^{2}={3}~{4}!{5}@{6}#{7}+{8}]{9}${10}|{11}[{12}&{13}]{14}={15}^{16}~{17}#{18}_{19};{20}${21}&{22}%{23}[{24}|{25}]{26}-{27}^{28}+{29}~{30}={31}@{32}${33}!{34}%{35}#{36}|{37}|{38}-{39}&{40}&{41}+{42}[{43};{44}]{45};{46}~{47}~{48}^{49}^{50}@{51}[{52}#{53}={54}!{55}~{56}+{57}!{58}^{59}", - e()) - +string.Format("/F:{0}#{1}#{2}-{3}${4}${5}+{6}%{7};{8}",f()) - + "/G:xx_xx/H:xx_xx/I:xx_xx/J:xx~xx@1" - ; - return result; - } - - public string[] p() { - var result = Enumerable.Repeat("xx",16).ToArray(); - result[0] = type; - result[1] = (beforePrev == null) ? 
"xx" : beforePrev.symbol; - result[2] = (prev == null) ? "xx" : prev.symbol; - result[3] = symbol; - result[4] = (next == null) ? "xx" : next.symbol; - result[5] = (afterNext == null) ? "xx" : afterNext.symbol; - result[11] = position.ToString(); - result[12] = position_backward.ToString(); - result[13] = distance_from_previous_vowel < 0 ? "xx" : distance_from_previous_vowel.ToString(); - result[14] = distance_to_next_vowel < 0 ? "xx" : distance_to_next_vowel.ToString(); - return result; - } - - public string[] a() { - return parent.a(); - } - - public string[] b() { - return parent.b(); - } - - public string[] c() { - return parent.c(); - } - - public string[] d() { - return parent.d(); - } - - public string[] e() { - return parent.e(); - } - - public string[] f() { - return parent.f(); - } - } - - //TODO - public class HTSNote { - public int startMs = 0; - public int endMs = 0; - public int positionTicks; - public int durationTicks = 0; - public int index = 0;//index of this note in sentence - public int indexBackwards = 0; - public int sentenceDurMs = 0; - - public int tone = 0; - public string[] symbols; - - public HTSNote? prev; - public HTSNote? 
next; - - public HTSNote(string[] symbols, int tone, int startms,int endms,int positionTicks, int durationTicks) { - this.startMs = startms; - this.endMs = endms; - this.tone = tone; - this.symbols = symbols; - this.positionTicks = positionTicks; - this.durationTicks = durationTicks; - } - - public int durationMs { - get { return endMs - startMs; } - } - - public int startMsBackwards { - get { return sentenceDurMs - startMs; } - } - - public string[] b() { - return new string[] { - symbols.Length.ToString(), - "1", - "1", - "xx", - "xx" - }; - } - - public string[] a() { - if (prev == null) { - return Enumerable.Repeat("xx", 5).ToArray(); - } else { - return prev.b(); - } - } - - public string[] c() { - if (next == null) { - return Enumerable.Repeat("xx", 5).ToArray(); - } else { - return next.b(); - } - } - - public string[] e() { - var result = Enumerable.Repeat("xx", 60).ToArray(); - result[0] = HTS.GetToneName(tone); - result[5] = "1";//number_of_syllables - result[6] = ((durationMs + 5) / 10).ToString();//duration in 10ms - result[7] = ((durationTicks + 10) / 20).ToString(); //length in 96th note, or 20 ticks - result[17] = index <= 0 ? "xx" : index.ToString();//index of note in sentence - result[18] = indexBackwards <= 0 ? "xx" : indexBackwards.ToString(); - result[19] = ((startMs + 50) / 100).ToString();//position in 100ms - result[20] = ((startMsBackwards + 50) / 100).ToString(); - if (this.tone > 0) { - result[56] = (prev == null || prev.tone <= 0) ? "p0" : HTS.WriteInt(prev.tone - tone); - result[57] = (next == null || next.tone <= 0) ? 
"p0" : HTS.WriteInt(next.tone - tone); - } else { - result[56] = "p0"; - result[57] = "p0"; - } - return result; - } - - public string[] d() { - if(prev == null) { - return Enumerable.Repeat("xx", 60).ToArray(); - } else { - return prev.e(); - } - } - public string[] f() { - if (next == null) { - return Enumerable.Repeat("xx", 60).ToArray(); - } else { - return next.e(); - } - } - } -} diff --git a/OpenUtau.Test/Core/Util/HtsSpecTests.cs b/OpenUtau.Test/Core/Util/HtsSpecTests.cs new file mode 100644 index 000000000..2b66cb46d --- /dev/null +++ b/OpenUtau.Test/Core/Util/HtsSpecTests.cs @@ -0,0 +1,311 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Text; +using System.Text.RegularExpressions; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; +using Xunit; + +namespace OpenUtau.Core.Util { + public class HtsSpecTests { + private static readonly Regex CurrentPhonemePattern = new(@"^[^@]+@[^\^]+\^[^-]+-(?[^+]+)\+", RegexOptions.Compiled); + protected Dictionary phoneDict = new Dictionary(); + protected List vowels = new List() {"a","i","u","e","o" }; + protected List consonants = new List() {"k","s","t","n","h","m","y","r","w","g","z","d","b","p" }; + protected List breaks = new List(); + protected List pauses = new List() { "pau", "sil" }; + protected List silences = new List(); + protected List unvoiced = new List(); + + private string GetPhonemeType(string phoneme) { + if (phoneme == "xx") { + return "xx"; + } + if (vowels.Contains(phoneme)) { + return "v"; + } + if (pauses.Contains(phoneme)) { + return "p"; + } + if (silences.Contains(phoneme)) { + return "s"; + } + if (breaks.Contains(phoneme)) { + return "b"; + } + //if (unvoiced.Contains(phoneme)) { + // return "c"; + //} + return "c"; + } + + private HTSNote MakeNote(int startMs, int endMs, int positionTicks, int durationTicks, int positionBar, string accent = "") { 
+ var symbols = new[] { "a" }; + var beatPerBar = 4; + var beatUnit = 4; + var key = 0; + double bpm = 120; + var tone = 60; // C4 + var isSlur = false; + var isRest = false; + var lang = "JPN"; + var accentStr = accent; + var note = new HTSNote(symbols, beatPerBar, beatUnit, positionBar, 0, key, bpm, tone, isSlur, isRest, lang, accentStr, startMs, endMs, positionTicks, durationTicks); + return note; + } + + private HTSPhrase BuildPhrase(HTSNote[] notes, int resolution) { + var phrase = new HTSPhrase(notes); + phrase.UpdateResolution(resolution); + var sentenceDurMs = notes.Sum(n => n.durationMs); + var sentenceDurTicks = notes.Sum(n => n.durationTicks); + for (var i = 0; i < notes.Length; i++) { + var n = notes[i]; + n.parent = phrase; + n.index = i + 1; + n.indexBackwards = notes.Length - i; + n.sentenceDurMs = sentenceDurMs; + n.sentenceDurTicks = sentenceDurTicks; + if (i > 0) { + notes[i - 1].next = n; + n.prev = notes[i - 1]; + } + } + return phrase; + } + + private TimeAxis BuildDefaultTimeAxis() { + var timeAxis = new TimeAxis(); + var project = new UProject(); + timeAxis.BuildSegments(project); + return timeAxis; + } + + [Fact] + public void MeasureForwardBackwardAreComputedPerBar() { + var res = 480; // ticks per quarter + var ticksPer96 = res / 24; // 20 + var n0 = MakeNote(0, 1000, 0, 480, 0); + var n1 = MakeNote(1000, 2000, 480, 480, 0); + var n2 = MakeNote(2000, 3000, 960, 480, 0); + var phrase = BuildPhrase(new[] { n0, n1, n2 }, res); + + var e0 = n0.e(); + var e1 = n1.e(); + var e2 = n2.e(); + + // forward index (e10) + Assert.Equal("0", e0[9]); + Assert.Equal("1", e1[9]); + Assert.Equal("2", e2[9]); + // backward index (e11) + Assert.Equal("2", e0[10]); + Assert.Equal("1", e1[10]); + Assert.Equal("0", e2[10]); + + // forward ms in centiseconds (e12) + Assert.Equal("0", e0[11]); + Assert.Equal("10", e1[11]); + Assert.Equal("20", e2[11]); + // backward ms in centiseconds (e13) + Assert.Equal("20", e0[12]); + Assert.Equal("10", e1[12]); + 
Assert.Equal("0", e2[12]); + + // forward 96th (e14) + Assert.Equal("0", e0[13]); + Assert.Equal((480 / ticksPer96).ToString(), e1[13]); + Assert.Equal((960 / ticksPer96).ToString(), e2[13]); + // backward 96th (e15) + Assert.Equal((960 / ticksPer96).ToString(), e0[14]); + Assert.Equal((480 / ticksPer96).ToString(), e1[14]); + Assert.Equal("0", e2[14]); + + // forward percent (e16) + Assert.Equal("0", e0[15]); + Assert.Equal("33", e1[15]); + Assert.Equal("66", e2[15]); + // backward percent (e17) + Assert.Equal("66", e0[16]); + Assert.Equal("33", e1[16]); + Assert.Equal("0", e2[16]); + } + + [Fact] + public void AccentDistancesForwardBackward() { + var res = 480; + var ticksPer96 = res / 24; // 20 + var n0 = MakeNote(0, 1000, 0, 480, 0, accent: ""); + var n1 = MakeNote(1000, 2000, 480, 480, 0, accent: "A"); + var n2 = MakeNote(2000, 3000, 960, 480, 0, accent: ""); + var n3 = MakeNote(3000, 4000, 1440, 480, 0, accent: "A"); + var phrase = BuildPhrase(new[] { n0, n1, n2, n3 }, res); + + var e0 = n0.e(); + var e1 = n1.e(); + var e2 = n2.e(); + var e3 = n3.e(); + + // For n2 (between accents): distances should be 1 note, 100 cs, 24 (96th) + Assert.Equal("1", e2[28]); // next accent (notes) + Assert.Equal("1", e2[29]); // prev accent (notes) + Assert.Equal("100", e2[30]); // next accent (cs) + Assert.Equal("100", e2[31]); // prev accent (cs) + Assert.Equal((480 / ticksPer96).ToString(), e2[32]); // next (96th) + Assert.Equal((480 / ticksPer96).ToString(), e2[33]); // prev (96th) + + // For n1 (accent): prev distance is 0, next accent is one note away (n2) + Assert.Equal("1", e1[28]); // next accent (n3 via one note n2) + Assert.Equal("0", e1[29]); // prev accent (itself) + Assert.Equal("100", e1[30]); // next accent (cs) + Assert.Equal("0", e1[31]); // prev accent (cs) + } + + [Fact] + public void NoteToPhonemesKeepsSharedNoteTiming() { + var note = new HTSNote( + new[] { "k", "a", "pau" }, + 4, + 4, + 0, + 0, + 0, + 120, + 60, + false, + false, + "JPN", + string.Empty, 
+ 120, + 360, + 0, + 480); + + var htsPhonemes = note.symbols.Select(x => new HTSPhoneme(x, note)).ToArray(); + int prevVowelPos = -1; + foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + htsPhonemes[i].position = i + 1; + htsPhonemes[i].position_backward = htsPhonemes.Length - i; + htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); + if (htsPhonemes[i].type == "v") { + prevVowelPos = i; + } else { + if (prevVowelPos > 0) { + htsPhonemes[i].prev_vowel_distance = i - prevVowelPos; + } + } + } + int nextVowelPos = -1; + for (int i = htsPhonemes.Length - 1; i > 0; --i) { + if (htsPhonemes[i].type == "v") { + nextVowelPos = i; + } else { + if (nextVowelPos > 0) { + htsPhonemes[i].next_vowel_distance = nextVowelPos - i; + } + } + } + + Assert.Equal(3, htsPhonemes.Length); + Assert.All(htsPhonemes, phoneme => Assert.Same(note, phoneme.parent)); + Assert.All(htsPhonemes, phoneme => Assert.Equal(120, phoneme.parent.startMs)); + Assert.All(htsPhonemes, phoneme => Assert.Equal(360, phoneme.parent.endMs)); + Assert.Equal(new[] { 1, 2, 3 }, htsPhonemes.Select(phoneme => phoneme.position).ToArray()); + Assert.Equal(new[] { 3, 2, 1 }, htsPhonemes.Select(phoneme => phoneme.position_backward).ToArray()); + Assert.Equal(new[] { "c", "v", "p" }, htsPhonemes.Select(phoneme => phoneme.type).ToArray()); + Assert.Equal(1, htsPhonemes[2].prev_vowel_distance); + } + + [Fact] + public void PhraseResolutionUpdateRecomputesMeasureTicks() { + var note0 = MakeNote(0, 1000, 0, 960, 0); + var note1 = MakeNote(1000, 2000, 960, 960, 0); + var phrase = new HTSPhrase(new[] { note0, note1 }); + note0.parent = phrase; + note1.parent = phrase; + note0.index = 1; + note1.index = 2; + note0.indexBackwards = 2; + note1.indexBackwards = 1; + note0.next = note1; + note1.prev = note0; + note0.sentenceDurMs = 2000; + note1.sentenceDurMs = 2000; + note0.sentenceDurTicks = 1920; + note1.sentenceDurTicks = 1920; + + phrase.UpdateResolution(960); + + var e1 = note1.e(); + Assert.Equal("24", 
e1[13]); + Assert.Equal("24", e1[21]); + } + + [Fact] + public void RestNoteMasksPitchFields() { + var rest = MakeNote(0, 500, 0, 480, 0); + rest.isRest = true; + rest.tone = 0; + + var phrase = BuildPhrase(new[] { rest }, 480); + var e = rest.e(); + + Assert.Equal("xx", e[0]); + Assert.Equal("xx", e[1]); + Assert.Equal("xx", e[56]); + Assert.Equal("xx", e[57]); + } + + [Fact] + public void PitchDifferenceToRestNeighborsUsesXx() { + var restStart = MakeNote(0, 500, 0, 480, 0); + restStart.isRest = true; + restStart.tone = 0; + var note = MakeNote(500, 1000, 480, 480, 0); + var restEnd = MakeNote(1000, 1500, 960, 480, 0); + restEnd.isRest = true; + restEnd.tone = 0; + + var phrase = BuildPhrase(new[] { restStart, note, restEnd }, 480); + var e = note.e(); + + Assert.Equal("xx", e[56]); + Assert.Equal("xx", e[57]); + } + + [Fact] + public void AlignTimingPositionsFollowsAnchorPoints() { + var durations = new[] { 20d, 10d, 30d }; + var alignPoints = new[] { + Tuple.Create(1, 100d), + Tuple.Create(3, 160d), + }; + + var positions = HTSContextBuilder.AlignTimingPositions(durations, alignPoints); + + Assert.Equal(2, positions.Count); + Assert.Equal(100d, positions[0]); + Assert.Equal(115d, positions[1]); + } + + [Fact] + public void BuildAlignedNoteTimingResultReturnsNoteRelativeTicks() { + var result = HTSContextBuilder.BuildAlignedNoteTimingResult( + new[] { "pau", "a", "b", "c" }, + 1, + 4, + new[] { 80d, 100d, 120d }, + 50d, + (start, end) => (int)Math.Round(end - start)); + + Assert.Equal(3, result.Count); + Assert.Equal(Tuple.Create("a", 30), result[0]); + Assert.Equal(Tuple.Create("b", 50), result[1]); + Assert.Equal(Tuple.Create("c", 70), result[2]); + } + } +} diff --git a/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs new file mode 100644 index 000000000..b8614ad6b --- /dev/null +++ b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs @@ -0,0 +1,242 @@ +using System; +using System.Collections.Generic; +using 
System.IO; +using System.Linq; +using System.Reflection; +using System.Text; +using OpenUtau.Api; +using OpenUtau.Classic; +using OpenUtau.Core; +using OpenUtau.Core.Format; +using OpenUtau.Core.Hts; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; +using Xunit; +using Xunit.Abstractions; + +namespace OpenUtau.Plugins { + // Minimal concrete HTSLabelPhonemizer for testing without external aligners. + class DummyHtsLabelPhonemizer : HTSLabelPhonemizer { + public string GeneratedFullScorePath => fullScorePath; + public string GeneratedMonoTimingPath => monoTimingPath; + public string GeneratedTempPath => htstmpPath; + + public DummyHtsLabelPhonemizer() { + // Minimal language and symbol classes + lang = "JPN"; + vowels = new List { "a", "i", "u", "e", "o" }; + pauses = new List { "pau" }; + silences = new List { "sil" }; + breaks = new List { "br" }; + } + + protected override IG2p LoadG2p(string rootPath) { + // Provide a tiny JP-like dictionary: simple CV mapping. 
+ var builder = G2pDictionary.NewBuilder(); + // vowels + builder.AddSymbol("a", true); + builder.AddSymbol("i", true); + builder.AddSymbol("u", true); + builder.AddSymbol("e", true); + builder.AddSymbol("o", true); + // consonants + var cons = new[] { "k", "s", "t", "n", "h", "m", "y", "r", "w" }; + foreach (var c in cons) builder.AddSymbol(c, false); + // pauses etc + builder.AddSymbol("pau", false); + builder.AddSymbol("sil", false); + builder.AddSymbol("br", false); + // single vowels + builder.AddEntry("a", new[] { "a" }); + builder.AddEntry("i", new[] { "i" }); + builder.AddEntry("u", new[] { "u" }); + builder.AddEntry("e", new[] { "e" }); + builder.AddEntry("o", new[] { "o" }); + // CV (subset) + builder.AddEntry("ka", new[] { "k", "a" }); + builder.AddEntry("ki", new[] { "k", "i" }); + builder.AddEntry("ku", new[] { "k", "u" }); + builder.AddEntry("ke", new[] { "k", "e" }); + builder.AddEntry("ko", new[] { "k", "o" }); + builder.AddEntry("ta", new[] { "t", "a" }); + builder.AddEntry("ti", new[] { "t", "i" }); + builder.AddEntry("to", new[] { "t", "o" }); + builder.AddEntry("na", new[] { "n", "a" }); + builder.AddEntry("ni", new[] { "n", "i" }); + builder.AddEntry("no", new[] { "n", "o" }); + builder.AddEntry("ma", new[] { "m", "a" }); + builder.AddEntry("mi", new[] { "m", "i" }); + builder.AddEntry("mo", new[] { "m", "o" }); + builder.AddEntry("ra", new[] { "r", "a" }); + builder.AddEntry("ri", new[] { "r", "i" }); + builder.AddEntry("ro", new[] { "r", "o" }); + return builder.Build(); + } + + protected override HTSNote CustomHTSNoteContext(HTSNote htsNote, Phonemizer.Note note) { + return htsNote; // no-op + } + + protected override HTSPhoneme[] CustomHTSPhonemeContext(HTSPhoneme[] htsPhonemes, Phonemizer.Note[] notes) { + return htsPhonemes; // no-op + } + + protected override Phonemizer.Note[][] PhraseAdjustments(Phonemizer.Note[][] phrese) { + return phrese; // no-op + } + + protected override void SendScore(Phonemizer.Note[][] phrase) { + // Create a 
fake mono_timing.lab with uniform 100ms durations for each phoneme in full_score.lab + if (!Directory.Exists(htstmpPath)) { + Directory.CreateDirectory(htstmpPath); + } + int count = 0; + if (File.Exists(fullScorePath)) { + count = File.ReadLines(fullScorePath).Count(); + } + long start = 0; + var lines = new List(count); + for (int i = 0; i < count; i++) { + long end = start + 1_000_000; // 100ms in 100ns units + lines.Add($"{start} {end} a"); + start = end; + } + File.WriteAllLines(monoTimingPath, lines); + } + } + + public class HtsLabelPhonemizerTest : PhonemizerTestBase { + public HtsLabelPhonemizerTest(ITestOutputHelper output) : base(output) { } + + protected override Phonemizer CreatePhonemizer() { + return new DummyHtsLabelPhonemizer(); + } + + [Theory] + [InlineData(new string[] { "a" }, new string[] { "a" })] + [InlineData(new string[] { "a", "i" }, new string[] { "a", "i" })] + [InlineData(new string[] { "a", "+~a", "i" }, new string[] { "a", "i" })] // extension note should not duplicate symbols + // JP CV + [InlineData(new string[] { "ka" }, new string[] { "k", "a" })] + [InlineData(new string[] { "ka", "ki" }, new string[] { "k", "a", "k", "i" })] + [InlineData(new string[] { "ka", "+~a", "ki" }, new string[] { "k", "a", "k", "i" })] + public void BasicHtsPipelineTest(string[] lyrics, string[] aliases) { + SameAltsTonesColorsTest("en_delta0", lyrics, aliases, "", "C4", ""); + } + + [Fact] + public void GeneratedLabelsCanDriveFrontendAndSimpleSynthesis() { + var phonemizer = CreateConfiguredPhonemizer(new[] { "ka", "ki", "ro" }); + + Assert.True(File.Exists(phonemizer.GeneratedFullScorePath)); + Assert.True(File.Exists(phonemizer.GeneratedMonoTimingPath)); + + var questionPath = WriteMinimalQuestionSet(phonemizer.GeneratedTempPath); + var questionSet = hts.load_question_set(questionPath, encoding: Encoding.UTF8); + var fullLabels = hts.load(phonemizer.GeneratedFullScorePath, Encoding.UTF8); + var monoLabels = 
hts.load(phonemizer.GeneratedMonoTimingPath, Encoding.UTF8); + var features = merlin.linguistic_features(fullLabels, questionSet.Item1, questionSet.Item2); + + Assert.Equal(fullLabels.Count, monoLabels.Count); + Assert.Equal(fullLabels.Count, features.Count); + Assert.All(features, feature => { + Assert.Single(feature); + Assert.Equal(1f, feature[0]); + }); + + var waveform = SynthesizeFromLabels(monoLabels, features, 16000); + + Assert.NotEmpty(waveform); + Assert.Contains(waveform, sample => Math.Abs(sample) > 0.0001f); + } + + DummyHtsLabelPhonemizer CreateConfiguredPhonemizer(string[] lyrics) { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + var dir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + var basePath = Path.Join(dir, "Files"); + var file = Path.Join(basePath, "en_delta0", "character.txt"); + + VoicebankLoader.IsTest = true; + var voicebank = new Voicebank() { File = file, BasePath = dir }; + VoicebankLoader.LoadVoicebank(voicebank); + var singer = new ClassicSinger(voicebank); + singer.EnsureLoaded(); + + var project = new UProject(); + Ustx.AddDefaultExpressions(project); + var track = project.tracks[0]; + project.expressions.TryGetValue(Ustx.CLR, out var descriptor); + track.VoiceColorExp = descriptor.Clone(); + var colors = singer.Subbanks.Select(subbank => subbank.Color).ToHashSet(); + track.VoiceColorExp.options = colors.OrderBy(color => color).ToArray(); + track.VoiceColorExp.max = track.VoiceColorExp.options.Length - 1; + + var timeAxis = new TimeAxis(); + timeAxis.BuildSegments(project); + + var phonemizer = new DummyHtsLabelPhonemizer(); + phonemizer.Testing = true; + phonemizer.SetSinger(singer); + phonemizer.SetTiming(timeAxis); + phonemizer.SetUp(BuildGroups(lyrics), project, track); + return phonemizer; + } + + Phonemizer.Note[][] BuildGroups(string[] lyrics) { + var groups = new List(); + int position = 240; + foreach (var lyric in lyrics) { + groups.Add(new[] { + new Phonemizer.Note { + lyric = 
lyric, + duration = 240, + position = position, + tone = Core.MusicMath.NameToTone("C4"), + phonemeAttributes = new[] { + new Phonemizer.PhonemeAttributes { + index = 0, + consonantStretchRatio = 1, + voiceColor = string.Empty, + } + }, + } + }); + position += 240; + } + return groups.ToArray(); + } + + string WriteMinimalQuestionSet(string directory) { + var questionPath = Path.Combine(directory, "test-minimal.qst"); + File.WriteAllLines(questionPath, new[] { + "QS \"ALL\" {*}", + }); + return questionPath; + } + + float[] SynthesizeFromLabels(HTSLabelFile monoLabels, List> features, int sampleRate) { + Assert.True(monoLabels.Count > 0); + long totalDuration = monoLabels[^1].end_time; + int totalSamples = (int)Math.Ceiling(totalDuration / 10_000_000.0 * sampleRate); + var waveform = new float[totalSamples]; + for (int index = 0; index < monoLabels.Count; index++) { + var label = monoLabels[index]; + Assert.True(label.end_time > label.start_time); + if (index > 0) { + Assert.Equal(monoLabels[index - 1].end_time, label.start_time); + } + int startSample = (int)Math.Round(label.start_time / 10_000_000.0 * sampleRate); + int endSample = Math.Min(totalSamples, (int)Math.Round(label.end_time / 10_000_000.0 * sampleRate)); + float amplitude = 0.05f + 0.05f * features[index].Sum(); + float frequency = 220f + 30f * index; + for (int sample = startSample; sample < endSample; sample++) { + float time = sample / (float)sampleRate; + waveform[sample] = amplitude * (float)Math.Sin(2 * Math.PI * frequency * time); + } + } + return waveform; + } + } +} From 92c1f736c31a052256005cb1aef032a36ee86cdc Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 18 May 2026 18:21:57 +0900 Subject: [PATCH 02/11] Fix expected values for test assertions The test assertions within the MeasureForwardBackwardAreComputedPerBar method have been fixed. 
The expected values for the indices e0, e1, and e2 have been modified, and the following items were updated: - Forward Index (e10): Fixed expected values for e0[9], e1[9], and e2[9]. - Backward Index (e11): Fixed expected values for e0[10], e1[10], and e2[10]. - Forward Percent (e16): Fixed expected values for e1[15] and e2[15]. - Backward Percent (e17): Fixed expected values for e0[16] and e1[16]. As a result, the test's expected values have been updated to align with the specifications. --- OpenUtau.Test/Core/Util/HtsSpecTests.cs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/OpenUtau.Test/Core/Util/HtsSpecTests.cs b/OpenUtau.Test/Core/Util/HtsSpecTests.cs index 2b66cb46d..a54682df9 100644 --- a/OpenUtau.Test/Core/Util/HtsSpecTests.cs +++ b/OpenUtau.Test/Core/Util/HtsSpecTests.cs @@ -99,13 +99,13 @@ public void MeasureForwardBackwardAreComputedPerBar() { var e2 = n2.e(); // forward index (e10) - Assert.Equal("0", e0[9]); - Assert.Equal("1", e1[9]); - Assert.Equal("2", e2[9]); + Assert.Equal("1", e0[9]); + Assert.Equal("2", e1[9]); + Assert.Equal("3", e2[9]); // backward index (e11) - Assert.Equal("2", e0[10]); - Assert.Equal("1", e1[10]); - Assert.Equal("0", e2[10]); + Assert.Equal("3", e0[10]); + Assert.Equal("2", e1[10]); + Assert.Equal("1", e2[10]); // forward ms in centiseconds (e12) Assert.Equal("0", e0[11]); @@ -127,11 +127,11 @@ public void MeasureForwardBackwardAreComputedPerBar() { // forward percent (e16) Assert.Equal("0", e0[15]); - Assert.Equal("33", e1[15]); - Assert.Equal("66", e2[15]); + Assert.Equal("50", e1[15]); + Assert.Equal("100", e2[15]); // backward percent (e17) - Assert.Equal("66", e0[16]); - Assert.Equal("33", e1[16]); + Assert.Equal("100", e0[16]); + Assert.Equal("50", e1[16]); Assert.Equal("0", e2[16]); } From ebfe4e49d95bdc04a2f49d0fd3cd99ba787c6319 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 18 May 2026 18:51:14 +0900 Subject: [PATCH 03/11] Addressing Copilot AI review comments --- 
OpenUtau.Core/Hts/HTSLabelPhonemizer.cs | 32 ++++++++++--------- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 9 ++++-- .../EnunuOnnx/EnunuOnnxPhonemizer.cs | 30 +++++++++-------- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs index d3c7f5f46..892e88d56 100644 --- a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs +++ b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs @@ -54,8 +54,7 @@ public override void SetSinger(USinger singer) { string rootPath; if (File.Exists(Path.Join(singer.Location, "enunux", "enuconfig.yaml"))) { rootPath = Path.Combine(singer.Location, "enunux"); - } - if (File.Exists(Path.Join(singer.Location, "enuconfig.yaml"))) { + }else if (File.Exists(Path.Join(singer.Location, "enuconfig.yaml"))) { rootPath = Path.Combine(singer.Location, "enunux"); } else { rootPath = singer.Location; @@ -63,7 +62,7 @@ public override void SetSinger(USinger singer) { //Load g2p from enunux.yaml //g2p dict should be load after enunu dict try { - g2p = LoadG2p(singer.Location); + g2p = LoadG2p(rootPath); } catch (Exception e) { Log.Error(e, "failed to load g2p dictionary"); return; @@ -176,7 +175,7 @@ public override void SetUp(Note[][] notes, UProject project, UTrack track) { if (existSymbol) { splitFlag = false; continue; - } else if (existSymbol && !splitFlag) { + } else if (!existSymbol && !splitFlag) { splitFlag = true; continue; } @@ -659,17 +658,20 @@ protected override void ProcessPart(Note[][] phrase) { } public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? 
nextNeighbour, Note[] prevs) { - if (!partResult.TryGetValue(notes[0].position, out var phonemes)) { - throw new Exception("error"); - } - return new Result { - phonemes = phonemes - .Select((tu) => new Phoneme() { - phoneme = tu.Item1, - position = tu.Item2, - }) - .ToArray(), - }; + if (partResult.TryGetValue(notes[0].position, out var phonemes)) { + return new Result { + phonemes = phonemes + .Select((tu) => new Phoneme() { + phoneme = tu.Item1, + position = tu.Item2, + }) + .ToArray(), + }; + } + if (SetUpException != null) { + throw new Exception("Phonemizer failed to process.", SetUpException); + } + throw new Exception("Part result not found"); } } } diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index 986505de6..26bf021a3 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Runtime.CompilerServices; using System.Text; using System.Threading; using System.Threading.Tasks; @@ -66,6 +67,7 @@ public virtual void SetUp() { phoneDict.Add("-", new string[] { "pau" }); phoneDict.Add("SP", new string[] { "pau" }); phoneDict.Add("AP", new string[] { "br" }); + LoadDict(monoScorePath, Encoding.UTF8); g2p = LoadG2p(); } @@ -156,7 +158,7 @@ protected IG2p LoadG2p() { if (existSymbol) { splitFlag = false; continue; - } else if (existSymbol && !splitFlag) { + } else if (!existSymbol && !splitFlag) { splitFlag = true; continue; } @@ -383,11 +385,12 @@ public void ProcessPart(RenderPhrase phrase) { continue; } } + int noteCount = tuples.Count; for (int i = 0; i < tuples.Count; i++) { var htsNote = tuples[i].Item1; htsNotes.Add(htsNote); - htsNote.index = i; - htsNote.indexBackwards = htsNotes.Count - i; + htsNote.index = i + 1; + htsNote.indexBackwards = noteCount - i; htsNote.sentenceDurMs = sentenceDurMs; htsNote.sentenceDurTicks = sentenceDurTicks; var tmpPhonemes = 
HTSNoteToPhonemes(htsNote); diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs index d9c664bcf..d3f3a1ccc 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs +++ b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs @@ -452,31 +452,35 @@ protected virtual HTSNote[] MakeSyllables(Note[] inputNotes, int startTick) { HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) { var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray(); - foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + foreach (var i in Enumerable.Range(0, htsPhonemes.Length)) { htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); htsPhonemes[i].position = i + 1; htsPhonemes[i].position_backward = htsPhonemes.Length - i; } - foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + foreach (var i in Enumerable.Range(0, htsPhonemes.Length)) { if (htsPhonemes[i].type.Equals("c")) { - int next = i + 1; - if (next < htsPhonemes.Length) { - if (htsPhonemes[next].type.Equals("v")) { - htsPhonemes[i].next_vowel_distance = 1; + var prev = i - 1; + if (prev >= 0) { + if (htsPhonemes[prev].type.Equals("v")) { + htsPhonemes[i].prev_vowel_distance = 1; + } else if (htsPhonemes[prev].prev_vowel_distance > 0) { + htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1; } else { - htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; + htsPhonemes[i].prev_vowel_distance = 0; } } } } - for (int i = htsPhonemes.Length - 1; i > 0; --i) { + for (var i = htsPhonemes.Length - 1; i >= 0; --i) { if (htsPhonemes[i].type.Equals("c")) { - int prev = i - 1; - if (prev >= 0) { - if (htsPhonemes[prev].type.Equals("v")) { - htsPhonemes[i].prev_vowel_distance = 1; + var next = i + 1; + if (next < htsPhonemes.Length) { + if (htsPhonemes[next].type.Equals("v")) { + htsPhonemes[i].next_vowel_distance = 1; + } else if 
(htsPhonemes[next].next_vowel_distance > 0) { + htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; } else { - htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1; + htsPhonemes[i].next_vowel_distance = 0; } } } From 53cac3ff32cd28663e609bdd479b1c9f0e791457 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 18 May 2026 19:37:11 +0900 Subject: [PATCH 04/11] Addressing Copilot AI review comments --- OpenUtau.Core/Util/HTS.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/OpenUtau.Core/Util/HTS.cs b/OpenUtau.Core/Util/HTS.cs index 790f6bf65..cb55436fc 100644 --- a/OpenUtau.Core/Util/HTS.cs +++ b/OpenUtau.Core/Util/HTS.cs @@ -40,8 +40,7 @@ public static string GetToneName(int noteNum) { } public static string GetOctaveNum(int noteNum) { - NameInOctave.TryGetValue(KeysInOctave[noteNum % 12].ToString(), out int num); - return noteNum < 0 ? string.Empty : num.ToString(); + return noteNum < 0 ? string.Empty : (noteNum / 12 - 1).ToString(); } //return -1 if error From 54f77a36da1ec7b26003d87dde13d8620f11861b Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 18 May 2026 19:55:59 +0900 Subject: [PATCH 05/11] Changing from a design that shares logic in the base class to implementing everything in the subclasses Change HTSLabelRenderer: SetUp method to abstract The SetUp method has been changed from virtual to abstract. As a result, all subclasses are now required to implement SetUp. The original logic within the SetUp method (initialization of phoneDict, language settings, dictionary loading, etc.) has been removed, and these responsibilities are now delegated to the subclasses. 
--- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index 26bf021a3..b2d0df3ef 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -59,17 +59,7 @@ public abstract class HTSLabelRenderer : IRenderer { protected string monoTimingPath = string.Empty; protected string fullTimingPath = string.Empty; - public virtual void SetUp() { - phoneDict.Clear(); - lang = "JPN";//TODO: use singer.language - // Lyrics often handled in OpenUtau - phoneDict.Add("R", new string[] { "pau" }); - phoneDict.Add("-", new string[] { "pau" }); - phoneDict.Add("SP", new string[] { "pau" }); - phoneDict.Add("AP", new string[] { "br" }); - LoadDict(monoScorePath, Encoding.UTF8); - g2p = LoadG2p(); - } + public abstract void SetUp(); protected virtual void LoadDict(string path, Encoding encoding) { if (path.EndsWith(".conf")) { From 536ccd5fc10ee6b5e77812f43e3af1ca7d5cc8cf Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Wed, 20 May 2026 18:50:41 +0900 Subject: [PATCH 06/11] Address review comments from Copilot --- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index b2d0df3ef..839786e0d 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -290,6 +290,17 @@ public void ProcessPart(RenderPhrase phrase) { int startTick = phrase.position; int endTick = phrase.position + phrase.duration; + // パディングを小節長で設定(開始・終了ともに1小節) + var sigStart = timeAxis.TimeSignatureAtTick(startTick); + var bpmStart = timeAxis.GetBpmAtTick(startTick); + var barLenMsStart = (int)Math.Round(60000.0 / bpmStart * sigStart.beatPerBar); + var barLenTicksStart = timeAxis.MsPosToTickPos(barLenMsStart); + + var sigEnd = 
timeAxis.TimeSignatureAtTick(endTick); + var bpmEnd = timeAxis.GetBpmAtTick(endTick); + var barLenMsEnd = (int)Math.Round(60000.0 / bpmEnd * sigEnd.beatPerBar); + var barLenTicksEnd = timeAxis.MsPosToTickPos(barLenMsEnd); + // 文全体の長さ(開始1小節 + 本体 + 終了1小節) double sentenceDurMs = headMs + phrase.endMs - phrase.positionMs + tailMs; int sentenceDurTicks = barLenTicksStart + (endTick - startTick) + barLenTicksEnd; @@ -440,7 +451,7 @@ public void ProcessPart(RenderPhrase phrase) { File.WriteAllLines(monoTimingPath, monoLabels_.Select(x => x.ToString())); } catch (Exception e) { Log.Error(e.ToString()); - throw e; + throw; } } From 0adbd7bf275820f426388c58ab93f43f55f22107 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Thu, 21 May 2026 10:25:33 +0900 Subject: [PATCH 07/11] Improvements to Vowel Extension Note Processing and Timing Calculations The conditions for the IsSyllableVowelExtensionNote method have been expanded to recognize lyrics starting with specific symbols as vowel extension notes. Additionally, the calculation of phonemeDuration within the ProcessPart method has been removed, and logic that calculates startMs and endMs directly has been introduced. In phoneme timing calculations, new logic considering headMs and phrase.positionMs has been added, and a process to adjust the end time of existing monoLabels has been implemented. This prevents overlaps and inconsistencies, improving the accuracy of timing. Furthermore, the startMs of the monoLabel at the end of a phrase has been changed to sentenceDurMs - tailMs to ensure that the timing of the entire phrase is accurately reflected.
--- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 41 ++++++++++++++++++--------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index 839786e0d..039f62d1a 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -214,7 +214,7 @@ private HTSNote makeHtsNote(string symbol, RenderNote note, int startTick, doubl } protected virtual bool IsSyllableVowelExtensionNote(RenderNote note) { - return note.lyric.StartsWith("+~") || note.lyric.StartsWith("+*"); + return note.lyric.StartsWith("+") || note.lyric.StartsWith("-") || note.lyric.StartsWith("*") || note.lyric.StartsWith("~"); } private string GetPhonemeType(string phoneme) { @@ -311,7 +311,7 @@ public void ProcessPart(RenderPhrase phrase) { List monoLabels_ = new List(); - double phonemeDuration = 0; + //double phonemeDuration = 0; HTSNote PaddingNoteStart = new HTSNote( symbols: new string[] { "pau" }, @@ -337,10 +337,10 @@ public void ProcessPart(RenderPhrase phrase) { monoLabels_.Add(new monoLabel() { symbol = htsPhonemes[0].symbol, - startMs = phonemeDuration, + startMs = 0, endMs = headMs }); - phonemeDuration += headMs; + //phonemeDuration += headMs; //Alignment var phonemesByNoteIndex = phrase.phones @@ -354,12 +354,18 @@ public void ProcessPart(RenderPhrase phrase) { var note = phrase.notes[noteIndex]; if (phonemesByNoteIndex.TryGetValue(noteIndex, out var phonemes)) { foreach (var phone in phonemes) { + var phoneStartMs = headMs + (phone.positionMs - phrase.positionMs); + var phoneEndMs = headMs + (phone.endMs - phrase.positionMs); + var lastMonoLabel = monoLabels_[^1]; + if (phoneStartMs < lastMonoLabel.endMs) { + lastMonoLabel.endMs = phoneStartMs; + monoLabels_[^1] = lastMonoLabel; + } monoLabels_.Add(new monoLabel() { symbol = phone.phoneme, - startMs = phonemeDuration, - endMs = phonemeDuration + phone.durationMs + startMs = phoneStartMs, + endMs = phoneEndMs }); - 
phonemeDuration += phone.durationMs; } lastBasePhonemes = phonemes; @@ -369,17 +375,24 @@ public void ProcessPart(RenderPhrase phrase) { // 拍点延長ノートは、直前の通常ノートの最後の母音を引き延ばす var extensionPhoneme = FindLastVowelOrLastPhoneme(lastBasePhonemes); if (!string.IsNullOrEmpty(extensionPhoneme.phoneme)) { - var extensionStartMs = note.positionMs - phrase.positionMs + headMs; - var extensionEndMs = note.endMs - phrase.positionMs + headMs; + var htsNote = makeHtsNote(extensionPhoneme.phoneme, note, startTick, headMs); + var extensionStartMs = htsNote.startMs; + var extensionEndMs = htsNote.endMs; + + var lastMonoLabel = monoLabels_[^1]; + if (lastMonoLabel.symbol == extensionPhoneme.phoneme && + lastMonoLabel.startMs < extensionStartMs && + extensionStartMs < lastMonoLabel.endMs) { + lastMonoLabel.endMs = extensionStartMs; + monoLabels_[^1] = lastMonoLabel; + } monoLabels_.Add(new monoLabel() { symbol = extensionPhoneme.phoneme, - startMs = phonemeDuration, - endMs = phonemeDuration + note.durationMs + startMs = extensionStartMs, + endMs = extensionEndMs }); - phonemeDuration += note.durationMs; - HTSNote htsNote = makeHtsNote(extensionPhoneme.phoneme, note, startTick, headMs); tuples.Add(Tuple.Create(htsNote, noteIndex)); } } else { @@ -424,7 +437,7 @@ public void ProcessPart(RenderPhrase phrase) { monoLabels_.Add(new monoLabel() { symbol = htsPhonemes[^1].symbol, - startMs = phonemeDuration, + startMs = sentenceDurMs - tailMs, endMs = sentenceDurMs }); From b97e10292e3e8d228d5715ef7bc25f84ea48f249 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Thu, 21 May 2026 12:40:50 +0900 Subject: [PATCH 08/11] Improvement of lyric processing and modification of vowel extension note determination conditions In the GetSymbols method of HTSLabelPhonemizer.cs, note.lyric is now passed directly to g2p.Query without converting it to lowercase, making it possible to distinguish between uppercase and lowercase letters. 
In HTSLabelRenderer.cs, an overload that accepts a single symbol has been added to the makeHtsNote method. In the IsSyllableVowelExtensionNote method, the determination condition for vowel extension notes has been changed to only +~ or +*, making the determination more strict. --- OpenUtau.Core/Hts/HTSLabelPhonemizer.cs | 2 +- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs index 892e88d56..7d3e26639 100644 --- a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs +++ b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs @@ -322,7 +322,7 @@ string[] GetSymbols(Note note) { .ToArray(); } // User has not provided hint, query g2p dictionary. - var g2presult = g2p.Query(note.lyric.ToLowerInvariant()); + var g2presult = g2p.Query(note.lyric); if (g2presult != null) { return g2presult; } diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index 039f62d1a..f9579cbe7 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -214,7 +214,7 @@ private HTSNote makeHtsNote(string symbol, RenderNote note, int startTick, doubl } protected virtual bool IsSyllableVowelExtensionNote(RenderNote note) { - return note.lyric.StartsWith("+") || note.lyric.StartsWith("-") || note.lyric.StartsWith("*") || note.lyric.StartsWith("~"); + return note.lyric.StartsWith("+~") || note.lyric.StartsWith("+*"); } private string GetPhonemeType(string phoneme) { From 36be00abf2ba3130b4e1172219a6d861f80844e4 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 8 Jun 2026 03:21:35 +0900 Subject: [PATCH 09/11] Fixing dictionary loading order and adding tests In HTSLabelPhonemizer.cs, the dictionary loading process within the SetSinger method has been moved before the g2p loading process to clarify dependencies. 
The following changes were made to HtsLabelPhonemizerTest.cs: - Added the DictionaryLoadedBeforeG2p property to the DummyHtsLabelPhonemizer class. - Set tablePath to "oto.ini" in the constructor. - Overrode the LoadG2p method to verify the dictionary loading state. - Added a new test, DictionaryIsLoadedBeforeG2p, to ensure that the dictionary is loaded before g2p. --- OpenUtau.Core/Hts/HTSLabelPhonemizer.cs | 16 ++++++++-------- OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs index 7d3e26639..19e0bcff3 100644 --- a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs +++ b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs @@ -59,14 +59,6 @@ public override void SetSinger(USinger singer) { } else { rootPath = singer.Location; } - //Load g2p from enunux.yaml - //g2p dict should be load after enunu dict - try { - g2p = LoadG2p(rootPath); - } catch (Exception e) { - Log.Error(e, "failed to load g2p dictionary"); - return; - } //Load Dictionary var enunuDictPath = Path.Join(rootPath, tablePath); try { @@ -75,6 +67,14 @@ public override void SetSinger(USinger singer) { Log.Error(e, $"failed to load dictionary from {enunuDictPath}"); return; } + //Load g2p from enunux.yaml + //g2p dict should be load after enunu dict + try { + g2p = LoadG2p(rootPath); + } catch (Exception e) { + Log.Error(e, "failed to load g2p dictionary"); + return; + } } protected virtual IG2p LoadG2p(string rootPath) { diff --git a/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs index b8614ad6b..97b9b1893 100644 --- a/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs +++ b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs @@ -22,6 +22,7 @@ class DummyHtsLabelPhonemizer : HTSLabelPhonemizer { public string GeneratedFullScorePath => fullScorePath; public string GeneratedMonoTimingPath => monoTimingPath; public string 
GeneratedTempPath => htstmpPath; + public bool DictionaryLoadedBeforeG2p { get; private set; } public DummyHtsLabelPhonemizer() { // Minimal language and symbol classes @@ -30,9 +31,11 @@ public DummyHtsLabelPhonemizer() { pauses = new List { "pau" }; silences = new List { "sil" }; breaks = new List { "br" }; + tablePath = "oto.ini"; } protected override IG2p LoadG2p(string rootPath) { + DictionaryLoadedBeforeG2p = phoneDict.Count > 0; // Provide a tiny JP-like dictionary: simple CV mapping. var builder = G2pDictionary.NewBuilder(); // vowels @@ -126,6 +129,13 @@ public void BasicHtsPipelineTest(string[] lyrics, string[] aliases) { SameAltsTonesColorsTest("en_delta0", lyrics, aliases, "", "C4", ""); } + [Fact] + public void DictionaryIsLoadedBeforeG2p() { + var phonemizer = CreateConfiguredPhonemizer(new[] { "a" }); + + Assert.True(phonemizer.DictionaryLoadedBeforeG2p); + } + [Fact] public void GeneratedLabelsCanDriveFrontendAndSimpleSynthesis() { var phonemizer = CreateConfiguredPhonemizer(new[] { "ka", "ki", "ro" }); From 623e85fc2676adceddebacef4df5becd06e8c28f Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 8 Jun 2026 03:22:24 +0900 Subject: [PATCH 10/11] Remove unused using directives --- OpenUtau.Core/Hts/HTSLabelPhonemizer.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs index 19e0bcff3..880910165 100644 --- a/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs +++ b/OpenUtau.Core/Hts/HTSLabelPhonemizer.cs @@ -9,7 +9,6 @@ using OpenUtau.Core.Util; using OpenUtau.Core.Util.nnmnkwii.io.hts; using Serilog; -using static System.Net.Mime.MediaTypeNames; namespace OpenUtau.Core.Hts { public abstract class HTSLabelPhonemizer : MachineLearningPhonemizer { From 623f18382908ac6b2ed3a93203d7add7098c7209 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Mon, 8 Jun 2026 03:32:06 +0900 Subject: [PATCH 11/11] The processing in HTSNoteToPhonemes was outdated, so I aligned it with the processing of 
the new Shuri HTSLabelPhonemizer. --- OpenUtau.Core/Hts/HTSLabelRenderer.cs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs index f9579cbe7..8e79ff032 100644 --- a/OpenUtau.Core/Hts/HTSLabelRenderer.cs +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -250,20 +250,24 @@ private HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) { if (prev >= 0) { if (htsPhonemes[prev].type.Equals("v")) { htsPhonemes[i].prev_vowel_distance = 1; - } else { + } else if (htsPhonemes[prev].prev_vowel_distance > 0) { htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1; + } else { + htsPhonemes[i].prev_vowel_distance = 0; } } } } - for (int i = htsPhonemes.Length - 1; i > 0; --i) { + for (int i = htsPhonemes.Length - 1; i >= 0; --i) { if (htsPhonemes[i].type.Equals("c")) { int next = i + 1; if (next < htsPhonemes.Length) { if (htsPhonemes[next].type.Equals("v")) { htsPhonemes[i].next_vowel_distance = 1; - } else { + } else if (htsPhonemes[next].next_vowel_distance > 0) { htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; + } else { + htsPhonemes[i].next_vowel_distance = 0; } } }