using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using K4os.Hash.xxHash;
using OpenUtau.Api;
using OpenUtau.Core.Ustx;
using OpenUtau.Core.Util;
using OpenUtau.Core.Util.nnmnkwii.io.hts;
using Serilog;

namespace OpenUtau.Core.Hts {
    /// <summary>
    /// Base class for phonemizers that describe each phrase as an HTS full-context score label,
    /// pass it to a backend through <see cref="SendScore"/>, and read the phoneme timing back
    /// from the mono timing label file written by that backend.
    /// </summary>
    public abstract class HTSLabelPhonemizer : MachineLearningPhonemizer {
        protected USinger singer;

        // Information used by the HTS writer.
        protected Dictionary<string, string[]> phoneDict = new Dictionary<string, string[]>();
        protected List<string> vowels = new List<string>();
        protected List<string> consonants = new List<string>();
        protected List<string> breaks = new List<string>();
        protected List<string> pauses = new List<string>();
        protected List<string> silences = new List<string>();
        protected List<string> unvoiced = new List<string>();
        protected string lang = "";
        int key = 0;
        int resolution = 480;

        // Information used by the OpenUtau phonemizer.
        protected IG2p g2p;

        // Result cache: position (ticks) of a note group's first note -> (phoneme, position) pairs.
        private Dictionary<int, List<Tuple<string, int>>> partResult = new Dictionary<int, List<Tuple<string, int>>>();

        protected string tmpPath = string.Empty;
        protected string tablePath = string.Empty;
        protected string questionPath = string.Empty;
        protected string htstmpPath = string.Empty;
        protected string monoScorePath = string.Empty;
        protected string fullScorePath = string.Empty;
        protected string monoTimingPath = string.Empty;
        protected string fullTimingPath = string.Empty;

        public HTSLabelPhonemizer() {
        }

        /// <summary>
        /// Stores the singer, reloads the phoneme dictionary from <see cref="tablePath"/> and then
        /// builds the g2p. Does nothing more when <paramref name="singer"/> is null. Load failures
        /// are logged and abort the remaining steps.
        /// </summary>
        public override void SetSinger(USinger singer) {
            this.singer = singer;
            if (singer == null) {
                return;
            }
            phoneDict.Clear();
            // Locate the folder that holds enuconfig.yaml: the "enunux" sub-folder when the config
            // is there, otherwise the singer root. (The original code sent the "config in the
            // singer root" case to the "enunux" sub-folder as well.)
            var enunuxDir = Path.Combine(singer.Location, "enunux");
            var rootPath = File.Exists(Path.Join(enunuxDir, "enuconfig.yaml"))
                ? enunuxDir
                : singer.Location;
            // Load the dictionary.
            var enunuDictPath = Path.Join(rootPath, tablePath);
            try {
                LoadDict(enunuDictPath, singer.TextFileEncoding);
            } catch (Exception e) {
                Log.Error(e, $"failed to load dictionary from {enunuDictPath}");
                return;
            }
            // Load g2p from enunux.yaml; it must be loaded after the enunu dictionary because
            // LoadG2p reads phoneDict.
            try {
                g2p = LoadG2p(rootPath);
            } catch (Exception e) {
                Log.Error(e, "failed to load g2p dictionary");
                return;
            }
        }

        /// <summary>
        /// Builds the g2p from <c>enunux.yaml</c> under <paramref name="rootPath"/> (when present)
        /// and from the entries already loaded into <see cref="phoneDict"/>.
        /// </summary>
        protected virtual IG2p LoadG2p(string rootPath) {
            var g2ps = new List<IG2p>();

            var enunuxPath = Path.Combine(rootPath, "enunux.yaml");
            var builder = G2pDictionary.NewBuilder();
            // Load dictionary from enunux.yaml and the nnsvs dict.
            if (File.Exists(enunuxPath)) {
                try {
                    var input = File.ReadAllText(enunuxPath, singer.TextFileEncoding);
                    // NOTE(review): the type argument was lost in the reviewed text; G2pDictionaryData
                    // is inferred from the symbols/entries members read below - confirm.
                    var data = Yaml.DefaultDeserializer.Deserialize<G2pDictionaryData>(input);
                    if (data.symbols != null) {
                        foreach (var symbolData in data.symbols) {
                            builder.AddSymbol(symbolData.symbol, symbolData.type);
                        }
                    }
                    // phoneDict entries are added before the yaml entries; the order is kept as
                    // written because the builder's duplicate handling is not visible here.
                    foreach (var grapheme in phoneDict.Keys) {
                        builder.AddEntry(grapheme, phoneDict[grapheme]);
                    }
                    if (data.entries != null) {
                        foreach (var entry in data.entries) {
                            builder.AddEntry(entry.grapheme, entry.phonemes);
                        }
                    }
                } catch (Exception e) {
                    Log.Error(e, "Failed to load Dictionary");
                }
            }
            // Also covers the cases where enunux.yaml is missing or failed half-way.
            foreach (var entry in phoneDict.Keys) {
                builder.AddEntry(entry, phoneDict[entry]);
            }
            g2ps.Add(builder.Build());
            return new G2pFallbacks(g2ps.ToArray());
        }

        /// <summary>
        /// Loads a phoneme dictionary into <see cref="phoneDict"/>: ".conf" files go through
        /// <see cref="LoadConf"/>, everything else through <see cref="LoadTable"/>.
        /// </summary>
        public void LoadDict(string path, Encoding encoding) {
            if (path.EndsWith(".conf")) {
                LoadConf(path, encoding);
            } else {
                LoadTable(path, encoding);
            }
        }

        /// <summary>
        /// Loads a whitespace-separated table: the first token of a line is the key, the remaining
        /// tokens are its phonemes. Blank lines are skipped and runs of whitespace count as one
        /// separator, so no empty key or empty phoneme is stored.
        /// </summary>
        public void LoadTable(string path, Encoding encoding) {
            foreach (var line in File.ReadLines(path, encoding)) {
                var tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length == 0) {
                    continue;
                }
                phoneDict[tokens[0]] = tokens[1..];
            }
        }

        /// <summary>
        /// Loads a <c>KEY="a,b,c"</c> style conf file. SILENCES, PAUSES and BREAK get defaults
        /// first, which the file may override. Lines without '=' are ignored.
        /// </summary>
        public void LoadConf(string path, Encoding encoding) {
            phoneDict["SILENCES"] = new string[] { "sil" };
            phoneDict["PAUSES"] = new string[] { "pau" };
            phoneDict["BREAK"] = new string[] { "br" };
            foreach (var line in File.ReadLines(path, encoding)) {
                var separator = line.IndexOf('=');
                if (separator < 0) {
                    continue;
                }
                // Split on the first '=' only and trim surrounding whitespace before stripping the
                // quotes, so `KEY = "a,b"` is read the same as `KEY="a,b"`.
                var name = line.Substring(0, separator).Trim();
                var value = line.Substring(separator + 1).Trim().Trim('"');
                phoneDict[name] = value.Split(',');
            }
        }

        /// <summary>
        /// Splits the part into phrases of back-to-back note groups and runs
        /// <see cref="ProcessPart"/> on each of them. Cached results of earlier runs are dropped.
        /// </summary>
        public override void SetUp(Note[][] notes, UProject project, UTrack track) {
            key = project.key;
            resolution = project.resolution;
            // Drop results of earlier runs so Process never serves timings of removed notes.
            partResult.Clear();
            if (notes == null || notes.Length == 0) {
                return;
            }
            // Split the whole part into phrases.
            var phrase = new List<Note[]> { notes[0] };
            for (var i = 1; i < notes.Length; ++i) {
                var previous = notes[i - 1][^1];
                if (previous.position + previous.duration == notes[i][0].position) {
                    // Adjacent groups touch each other: same phrase.
                    phrase.Add(notes[i]);
                } else {
                    // There is a gap: process the current phrase and start the next one.
                    ProcessPart(phrase.ToArray());
                    phrase.Clear();
                    phrase.Add(notes[i]);
                }
            }
            // The list always holds at least one group at this point.
            ProcessPart(phrase.ToArray());
        }

        /// <summary>
        /// Collects the whitespace-separated lyric tokens that are not valid g2p symbols.
        /// </summary>
        /// <remarks>
        /// NOTE(review): behaviour is kept exactly as written. Traced through, the suffix branch
        /// can never run: the first invalid token after a valid symbol is dropped and re-arms the
        /// prefix state, so <c>suffix</c> is always empty. This looks unintended - confirm the
        /// intended contract with the subclasses before changing it.
        /// </remarks>
        protected (string prefix, string suffix) GetPrefixAndSuffix(Note note) {
            var prefix = string.Empty;
            var suffix = string.Empty;

            var textList = note.lyric.Split().ToList();
            var splitFlag = true;
            foreach (var text in textList) {
                var existSymbol = g2p.IsValidSymbol(text);
                if (existSymbol) {
                    splitFlag = false;
                    continue;
                } else if (!existSymbol && !splitFlag) {
                    splitFlag = true;
                    continue;
                }
                if (splitFlag) {
                    prefix += text;
                } else {
                    suffix += text;
                }
            }

            return (prefix, suffix);
        }

        /// <summary>
        /// Hook for subclasses to adjust a freshly built HTS note. Returning null keeps
        /// <paramref name="htsNote"/> as it is.
        /// </summary>
        protected abstract HTSNote CustomHTSNoteContext(HTSNote htsNote, Note note);

        /// <summary>
        /// Makes an HTS note from the given symbols and the notes they are sung on. The span runs
        /// from the first note's position to the end of the last note.
        /// </summary>
        //TODO:Fix the processing for rests
        protected HTSNote makeHtsNote(string[] symbols, IList<Note> group, int startTick) {
            var htsNote = HTSContextBuilder.BuildNote(
                symbols,
                group[0].tone,
                IsSyllableVowelExtensionNote(group[0]),
                lang,
                key,
                timeAxis,
                group[0].position,
                group[^1].position + group[^1].duration,
                startTick,
                0,
                symbol => pauses.Contains(symbol) || silences.Contains(symbol) || breaks.Contains(symbol));
            return CustomHTSNoteContext(htsNote, group[0]) ?? htsNote;
        }

        /// <summary>Single-symbol overload of <see cref="makeHtsNote(string[], IList{Note}, int)"/>.</summary>
        protected HTSNote makeHtsNote(string symbol, Note[] group, int startTick) {
            return makeHtsNote(new string[] { symbol }, group, startTick);
        }

        /// <summary>
        /// True when the lyric starts with "+~" or "+*". Notes without a lyric (such as the ones
        /// created by <see cref="HandleNotEnoughNotes"/>) are not extension notes.
        /// </summary>
        protected bool IsSyllableVowelExtensionNote(Note note) {
            var lyric = note.lyric;
            if (lyric == null) {
                return false;
            }
            return lyric.StartsWith("+~", StringComparison.Ordinal)
                || lyric.StartsWith("+*", StringComparison.Ordinal);
        }

        /// <summary>
        /// Walks the extension notes of a group: a "+~"/"+*" note repeats the current vowel, any
        /// other extension note advances to the next vowel of <paramref name="symbols"/>.
        /// </summary>
        private string[] ApplyExtensions(string[] symbols, Note[] notes) {
            var newSymbols = new List<string>();
            var vowelIds = ExtractVowels(symbols);
            if (vowelIds.Count == 0) {
                // no syllables or all consonants, the last phoneme will be interpreted as vowel
                vowelIds.Add(symbols.Length - 1);
            }
            var lastVowelI = 0;
            newSymbols.AddRange(symbols.Take(vowelIds[lastVowelI] + 1));
            for (var i = 1; i < notes.Length && lastVowelI + 1 < vowelIds.Count; i++) {
                if (!IsSyllableVowelExtensionNote(notes[i])) {
                    var prevVowel = vowelIds[lastVowelI];
                    lastVowelI++;
                    var vowel = vowelIds[lastVowelI];
                    newSymbols.AddRange(symbols.Skip(prevVowel + 1).Take(vowel - prevVowel));
                } else {
                    newSymbols.Add(symbols[vowelIds[lastVowelI]]);
                }
            }
            newSymbols.AddRange(symbols.Skip(vowelIds[lastVowelI] + 1));
            return newSymbols.ToArray();
        }

        /// <summary>Returns the indices of the symbols the g2p reports as vowels.</summary>
        private List<int> ExtractVowels(string[] symbols) {
            var vowelIds = new List<int>();
            for (var i = 0; i < symbols.Length; i++) {
                if (g2p.IsVowel(symbols[i])) {
                    vowelIds.Add(i);
                }
            }
            return vowelIds;
        }

        /// <summary>
        /// Called when a group has fewer notes than vowels: the last note is split so that the
        /// note count matches. The created notes carry no lyric.
        /// </summary>
        protected virtual Note[] HandleNotEnoughNotes(Note[] notes, List<int> vowelIds) {
            var newNotes = new List<Note>();
            newNotes.AddRange(notes.SkipLast(1));
            var lastNote = notes.Last();
            var position = lastNote.position;
            var notesToSplit = vowelIds.Count - newNotes.Count;
            // Slice length rounded down to a multiple of 15 ticks; this is 0 for very short notes,
            // in which case the final slice receives the whole duration.
            var duration = lastNote.duration / notesToSplit / 15 * 15;
            for (var i = 0; i < notesToSplit; i++) {
                var durationFinal = i != notesToSplit - 1 ? duration : lastNote.duration - duration * (notesToSplit - 1);
                newNotes.Add(new Note() {
                    position = position,
                    duration = durationFinal,
                    tone = lastNote.tone,
                    phonemeAttributes = lastNote.phonemeAttributes
                });
                position += durationFinal;
            }

            return newNotes.ToArray();
        }

        /// <summary>
        /// Called when a group has more notes than vowels: the surplus notes are merged into the
        /// note of the last vowel, whose duration becomes the sum of the merged durations.
        /// </summary>
        protected virtual Note[] HandleExcessNotes(Note[] notes, List<int> vowelIds) {
            var newNotes = new List<Note>();
            var SyllableCount = vowelIds.Count;
            newNotes.AddRange(notes.Take(SyllableCount - 1));
            var lastNote = notes[SyllableCount - 1];
            newNotes.Add(new Note() {
                lyric = lastNote.lyric,
                phoneticHint = lastNote.phoneticHint,
                position = lastNote.position,
                duration = notes[(SyllableCount - 1)..].Select(note => note.duration).Sum(),
                tone = lastNote.tone,
                phonemeAttributes = lastNote.phonemeAttributes
            });
            return newNotes.ToArray();
        }

        /// <summary>
        /// Maps a phoneme to its HTS type code: "xx" stays "xx", then "v" (vowel), "p" (pause),
        /// "s" (silence), "b" (break); anything else is "c".
        /// </summary>
        public string GetPhonemeType(string phoneme) {
            if (phoneme == "xx") {
                return "xx";
            }
            if (vowels.Contains(phoneme)) {
                return "v";
            }
            if (pauses.Contains(phoneme)) {
                return "p";
            }
            if (silences.Contains(phoneme)) {
                return "s";
            }
            if (breaks.Contains(phoneme)) {
                return "b";
            }
            return "c";
        }

        /// <summary>Resolves the symbols sung on a note; never returns null.</summary>
        string[] GetSymbols(Note note) {
            //priority:
            //1. phonetic hint
            //2. query from g2p dictionary
            //3. treat lyric as phonetic hint, including single phoneme
            //4. default pause
            if (!string.IsNullOrEmpty(note.phoneticHint)) {
                // Split space-separated symbols into an array.
                return note.phoneticHint.Split()
                    .Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
                    .ToArray();
            }
            // User has not provided hint, query g2p dictionary.
            var g2presult = g2p.Query(note.lyric);
            if (g2presult != null) {
                return g2presult;
            }
            // Not found in the g2p dictionary, treat the lyric as a phonetic hint.
            var lyricSplited = note.lyric.Split()
                .Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
                .ToArray();
            if (lyricSplited.Length > 0) {
                return lyricSplited;
            }
            return new string[] { "pau" };
        }

        /// <summary>
        /// Resolves the symbols of a group, applies the extension notes, and reconciles the note
        /// count with the vowel count through the HandleNotEnoughNotes / HandleExcessNotes hooks.
        /// </summary>
        private (string[], int[], Note[]) GetSymbolsAndVowels(Note[] notes) {
            var mainNote = notes[0];
            var symbols = GetSymbols(mainNote);
            if (symbols == null) {
                return (null, null, null);
            }
            if (symbols.Length == 0) {
                symbols = new string[] { "" };
            }
            symbols = ApplyExtensions(symbols, notes);
            var vowelIds = ExtractVowels(symbols);
            if (vowelIds.Count == 0) {
                // no syllables or all consonants, the last phoneme will be interpreted as vowel
                vowelIds.Add(symbols.Length - 1);
            }
            if (notes.Length < vowelIds.Count) {
                notes = HandleNotEnoughNotes(notes, vowelIds);
            } else if (notes.Length > vowelIds.Count) {
                notes = HandleExcessNotes(notes, vowelIds);
            }
            return (symbols, vowelIds.ToArray(), notes);
        }

        protected struct Syllable {
            public List<string> symbols;
            public List<Note> notes;
        }

        /// <summary>
        /// Turns one note group into HTS notes, one per vowel. Consonants before a vowel belong
        /// to that vowel's syllable; consonants after the last vowel are appended to the last
        /// syllable. Returns null when the group cannot be resolved.
        /// </summary>
        protected virtual HTSNote[] MakeSyllables(Note[] inputNotes, int startTick) {
            (var symbols, var vowelIds, var notes) = GetSymbolsAndVowels(inputNotes);
            if (symbols == null || vowelIds == null || notes == null) {
                return null;
            }
            var firstVowelId = vowelIds[0];
            if (notes.Length < vowelIds.Length) {
                //error = $"Not enough extension notes, {vowelIds.Length - notes.Length} more expected";
                return null;
            }

            var syllables = new Syllable[vowelIds.Length];

            // The first syllable: everything up to and including the first vowel.
            syllables[0] = new Syllable() {
                symbols = symbols.Take(firstVowelId + 1).ToList(),
                notes = notes[0..1].ToList()
            };

            // Normal syllables after the first one.
            var noteI = 1;
            var ccs = new List<string>();
            for (var symbolI = firstVowelId + 1; symbolI < symbols.Length; symbolI++) {
                if (!vowelIds.Contains(symbolI)) {
                    ccs.Add(symbols[symbolI]);
                } else {
                    syllables[noteI] = new Syllable() {
                        symbols = ccs.Append(symbols[symbolI]).ToList(),
                        notes = new List<Note>() { notes[noteI] }
                    };
                    ccs = new List<string>();
                    noteI++;
                }
            }
            syllables[^1].symbols.AddRange(ccs);
            return syllables.Select(x => makeHtsNote(x.symbols.ToArray(), x.notes, startTick)).ToArray();
        }

        /// <summary>
        /// Creates the phonemes of an HTS note and fills in their type, their 1-based position
        /// from both ends, and for consonants the distance to the previous / next vowel inside
        /// the note (0 when there is none).
        /// </summary>
        HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) {
            var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray();
            // Assign type (vowel / consonant / rest ...) and position to every phoneme.
            for (var i = 0; i < htsPhonemes.Length; i++) {
                htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol);
                htsPhonemes[i].position = i + 1;
                htsPhonemes[i].position_backward = htsPhonemes.Length - i;
            }
            // Forward pass: distance to the previous vowel. The first phoneme has no predecessor.
            for (var i = 1; i < htsPhonemes.Length; i++) {
                if (htsPhonemes[i].type != "c") {
                    continue;
                }
                var prev = htsPhonemes[i - 1];
                if (prev.type == "v") {
                    htsPhonemes[i].prev_vowel_distance = 1;
                } else if (prev.prev_vowel_distance > 0) {
                    htsPhonemes[i].prev_vowel_distance = prev.prev_vowel_distance + 1;
                } else {
                    htsPhonemes[i].prev_vowel_distance = 0;
                }
            }
            // Backward pass: distance to the next vowel. The last phoneme has no successor.
            for (var i = htsPhonemes.Length - 2; i >= 0; --i) {
                if (htsPhonemes[i].type != "c") {
                    continue;
                }
                var next = htsPhonemes[i + 1];
                if (next.type == "v") {
                    htsPhonemes[i].next_vowel_distance = 1;
                } else if (next.next_vowel_distance > 0) {
                    htsPhonemes[i].next_vowel_distance = next.next_vowel_distance + 1;
                } else {
                    htsPhonemes[i].next_vowel_distance = 0;
                }
            }
            return htsPhonemes;
        }

        /// <summary>
        /// Sends the score written to <see cref="fullScorePath"/> to the backend.
        /// <see cref="ProcessPart"/> expects <see cref="monoTimingPath"/> to exist afterwards.
        /// </summary>
        protected abstract void SendScore(Note[][] phrase);

        /// <summary>
        /// Hashes the lyric, phonetic hint, tone shift, position, duration and tone of every note
        /// of the phrase. The hash names the temporary folder of the phrase, so any edit to any
        /// note (extension notes and pitch included) yields a different folder.
        /// </summary>
        ulong HashPhraseGroups(Note[][] phrase) {
            using (var stream = new MemoryStream()) {
                using (var writer = new BinaryWriter(stream)) {
                    foreach (var group in phrase) {
                        foreach (var note in group) {
                            writer.Write(note.lyric ?? string.Empty);
                            if (note.phoneticHint != null) {
                                writer.Write("[" + note.phoneticHint + "]");
                            }
                            var attr = note.phonemeAttributes?.FirstOrDefault(a => a.index == 0) ?? default;
                            writer.Write(attr.toneShift);
                            writer.Write(note.position);
                            writer.Write(note.duration);
                            writer.Write(note.tone);
                        }
                    }
                    writer.Flush();
                    return XXH64.DigestOf(stream.ToArray());
                }
            }
        }

        /// <summary>
        /// Hook for subclasses to rewrite the phrase before it is converted. Returning null keeps
        /// the phrase as it is.
        /// </summary>
        protected abstract Note[][] PhraseAdjustments(Note[][] phrese);

        /// <summary>
        /// Hook for subclasses to adjust the phonemes of one HTS note. Returning null keeps
        /// <paramref name="htsPhonemes"/> as they are.
        /// </summary>
        protected abstract HTSPhoneme[] CustomHTSPhonemeContext(HTSPhoneme[] htsPhonemes, Note[] notes);

        /// <summary>
        /// Converts one phrase into HTS notes and phonemes (padded with a one-bar "pau" at both
        /// ends), writes the full score label, calls <see cref="SendScore"/>, reads the mono
        /// timing label and stores the aligned phonemes of every note group in the result cache.
        /// </summary>
        protected override void ProcessPart(Note[][] phrase) {
            if (phrase == null || phrase.Length == 0) {
                return;
            }
            tmpPath = Path.Join(PathManager.Inst.CachePath, $"lab-{HashPhraseGroups(phrase):x16}");
            htstmpPath = tmpPath + "_htstemp";
            fullScorePath = Path.Join(htstmpPath, "full_score.lab");
            fullTimingPath = Path.Join(htstmpPath, "full_timing.lab");
            monoScorePath = Path.Join(htstmpPath, "mono_score.lab");
            monoTimingPath = Path.Join(htstmpPath, "mono_timing.lab");

            phrase = PhraseAdjustments(phrase) ?? phrase;
            if (phrase.Length == 0) {
                return;
            }

            var startTick = phrase[0][0].position;
            var endTick = phrase[^1][^1].position + phrase[^1][^1].duration;

            // Padding is one bar long, at the start and at the end.
            var sigStart = timeAxis.TimeSignatureAtTick(startTick);
            var bpmStart = timeAxis.GetBpmAtTick(startTick);
            var barLenMsStart = (int)Math.Round(60000.0 / bpmStart * sigStart.beatPerBar);
            var barLenTicksStart = timeAxis.MsPosToTickPos(barLenMsStart);

            var sigEnd = timeAxis.TimeSignatureAtTick(endTick);
            var bpmEnd = timeAxis.GetBpmAtTick(endTick);
            var barLenMsEnd = (int)Math.Round(60000.0 / bpmEnd * sigEnd.beatPerBar);
            var barLenTicksEnd = timeAxis.MsPosToTickPos(barLenMsEnd);

            // Length of the whole sentence: leading bar + body + trailing bar.
            var sentenceDurMs = barLenMsStart + (int)timeAxis.MsBetweenTickPos(startTick, endTick) + barLenMsEnd;
            var sentenceDurTicks = barLenTicksStart + (endTick - startTick) + barLenTicksEnd;

            // notePhIndex[g] .. notePhIndex[g + 1] is the phoneme range of note group g;
            // the initial 1 accounts for the leading padding phoneme.
            var notePhIndex = new List<int> { 1 };
            var phAlignPoints = new List<Tuple<int, double>>();

            // Leading padding "pau".
            timeAxis.TickPosToBarBeat(startTick - barLenTicksStart, out var barStart, out var beatStart, out var _);
            var sigForPadStart = timeAxis.TimeSignatureAtTick(startTick - barLenTicksStart);
            var PaddingNoteStart = new HTSNote(
                symbols: new string[] { "pau" },
                beatPerBar: sigForPadStart.beatPerBar,
                beatUnit: sigForPadStart.beatUnit,
                positionBar: barStart,
                positionBeat: beatStart,
                key: key,
                bpm: timeAxis.GetBpmAtTick(startTick - barLenTicksStart),
                tone: 0,
                isSlur: false,
                isRest: true,
                lang: string.Empty,
                accent: string.Empty,
                startms: 0,
                endms: barLenMsStart,
                positionTicks: startTick - barLenTicksStart,
                durationTicks: barLenTicksStart
            );
            var htsNotes = new List<HTSNote> { PaddingNoteStart };
            var htsPhonemes = new List<HTSPhoneme>();
            // A null result from the hook keeps the unmodified phonemes, as for regular notes.
            var startPadPhonemes = HTSNoteToPhonemes(PaddingNoteStart);
            htsPhonemes.AddRange(CustomHTSPhonemeContext(startPadPhonemes, phrase[0]) ?? startPadPhonemes);

            // Score notes -> HTS notes.
            for (var n = 0; n < phrase.Length; ++n) {
                // MakeSyllables returns null for a group it cannot resolve; such a group
                // contributes no phonemes but still gets its (empty) range in notePhIndex.
                var Syllables = MakeSyllables(phrase[n], startTick) ?? Array.Empty<HTSNote>();
                // Shift start / end of every note by the leading padding.
                foreach (var note in Syllables) {
                    note.startMs += barLenMsStart;
                    note.endMs += barLenMsStart;
                }
                htsNotes.AddRange(Syllables);

                for (var noteIndex = 0; noteIndex < Syllables.Length; noteIndex++) {
                    var htsNote = Syllables[noteIndex];
                    var tmpPhonemes = HTSNoteToPhonemes(htsNote);
                    var notePhonemes = CustomHTSPhonemeContext(tmpPhonemes, phrase[n]) ?? tmpPhonemes;

                    // The first vowel of the note is the alignment anchor.
                    var firstVowelIndex = 0;
                    for (var phIndex = 0; phIndex < htsNote.symbols.Length; phIndex++) {
                        if (g2p.IsVowel(htsNote.symbols[phIndex])) {
                            firstVowelIndex = phIndex;
                            break;
                        }
                    }
                    phAlignPoints.Add(Tuple.Create(
                        htsPhonemes.Count + firstVowelIndex,
                        timeAxis.TickPosToMsPos(htsNote.positionTicks) + barLenMsStart
                    ));
                    htsPhonemes.AddRange(notePhonemes);
                }
                notePhIndex.Add(htsPhonemes.Count);
            }

            // Trailing padding "pau", positioned at the real end tick of the phrase.
            timeAxis.TickPosToBarBeat(endTick, out var barEnd, out var beatEnd, out var _);
            var PaddingNoteEnd = new HTSNote(
                symbols: new string[] { "pau" },
                beatPerBar: sigEnd.beatPerBar,
                beatUnit: sigEnd.beatUnit,
                positionBar: barEnd,
                positionBeat: beatEnd,
                key: key,
                bpm: bpmEnd,
                tone: 0,
                isSlur: false,
                isRest: true,
                lang: string.Empty,
                accent: string.Empty,
                // Placed at the very end of the sentence.
                startms: sentenceDurMs - barLenMsEnd,
                endms: sentenceDurMs,
                positionTicks: endTick,
                durationTicks: barLenTicksEnd
            );
            htsNotes.Add(PaddingNoteEnd);
            var endPadPhonemes = HTSNoteToPhonemes(PaddingNoteEnd);
            htsPhonemes.AddRange(CustomHTSPhonemeContext(endPadPhonemes, phrase[^1]) ?? endPadPhonemes);

            // The last anchor sits at "phrase end + trailing padding".
            var lastNote = htsNotes[^1];
            phAlignPoints.Add(Tuple.Create(
                htsPhonemes.Count,
                timeAxis.TickPosToMsPos(lastNote.positionTicks + lastNote.durationTicks) + barLenMsStart
            ));
            var htsPhrase = new HTSPhrase(htsNotes.ToArray());
            htsPhrase.UpdateResolution(resolution);
            htsPhrase.totalNotes = htsNotes.Count - 2;
            // NOTE(review): only two padding phonemes are added above, yet 3 is subtracted here.
            // Kept as written - confirm against what the HTS writer expects.
            htsPhrase.totalPhonemes = htsPhonemes.Count - 3;
            htsPhrase.totalPhrases = 1;
            // Make neighbourhood links between htsNotes and between htsPhonemes.
            for (var i = 0; i < htsNotes.Count; i++) {
                htsNotes[i].parent = htsPhrase;
                htsNotes[i].index = i;
                htsNotes[i].indexBackwards = htsNotes.Count - i - 1;
                htsNotes[i].sentenceDurMs = sentenceDurMs;
                htsNotes[i].sentenceDurTicks = sentenceDurTicks;
                if (i > 0) {
                    htsNotes[i].prev = htsNotes[i - 1];
                    htsNotes[i - 1].next = htsNotes[i];
                }
            }
            for (var i = 1; i < htsPhonemes.Count; ++i) {
                htsPhonemes[i].prev = htsPhonemes[i - 1];
                htsPhonemes[i - 1].next = htsPhonemes[i];
            }

            try {
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(htstmpPath);
                File.WriteAllLines(fullScorePath, htsPhonemes.Select(x => x.dump()));
            } catch (Exception e) {
                // Pass the exception as an argument, not as the message template.
                Log.Error(e, "Failed to write full score label {Path}", fullScorePath);
                throw;
            }

            SendScore(phrase);
            if (!File.Exists(monoTimingPath)) {
                Log.Error($"File not found.:{monoTimingPath}");
                return;
            }

            var hTSLabels = hts.load(monoTimingPath, Encoding.UTF8);

            // Label times are in 100 ns units; divide by 10000 to get milliseconds.
            // The first and the last label are replaced by copies of their neighbours.
            var labPositions =
                hTSLabels.Skip(1).SkipLast(1).Select(label => (label.end_time - label.start_time) / 10000.0).ToList();
            if (labPositions.Count == 0) {
                // Fewer than three labels: nothing to align.
                Log.Error("Timing label has too few entries: {Path}", monoTimingPath);
                return;
            }
            labPositions.Insert(0, labPositions[0]);
            labPositions.Add(labPositions[^1]);

            var positions = HTSContextBuilder.AlignTimingPositions(labPositions, phAlignPoints);

            // Store the result of every note group under the position of its first note.
            var phonemesRedirected = htsPhonemes.Select(x => x.symbol).ToArray();
            for (var groupIndex = 0; groupIndex < phrase.Length; groupIndex++) {
                var group = phrase[groupIndex];
                if (group[0].lyric.StartsWith("+", StringComparison.Ordinal)) {
                    continue;
                }
                var notePos = timeAxis.TickPosToMsPos(group[0].position) + barLenMsStart; // ms
                var noteResult = HTSContextBuilder.BuildAlignedNoteTimingResult(
                    phonemesRedirected,
                    notePhIndex[groupIndex],
                    notePhIndex[groupIndex + 1],
                    positions,
                    notePos,
                    timeAxis.TicksBetweenMsPos);
                partResult[group[0].position] = noteResult;
            }
        }

        /// <summary>
        /// Returns the cached phonemes of the note group that starts at <c>notes[0].position</c>.
        /// </summary>
        /// <exception cref="Exception">
        /// No result is cached for the group; <c>SetUpException</c> is attached as the inner
        /// exception when it is set.
        /// </exception>
        public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? nextNeighbour, Note[] prevs) {
            if (partResult.TryGetValue(notes[0].position, out var phonemes)) {
                return new Result {
                    phonemes = phonemes
                        .Select((tu) => new Phoneme() {
                            phoneme = tu.Item1,
                            position = tu.Item2,
                        })
                        .ToArray(),
                };
            }
            if (SetUpException != null) {
                throw new Exception("Phonemizer failed to process.", SetUpException);
            }
            throw new Exception("Part result not found");
        }
    }
}
diff --git a/OpenUtau.Core/Hts/HTSLabelRenderer.cs b/OpenUtau.Core/Hts/HTSLabelRenderer.cs new file mode 100644 index 000000000..8e79ff032 --- /dev/null +++ b/OpenUtau.Core/Hts/HTSLabelRenderer.cs @@ -0,0 +1,505 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using OpenUtau.Api; +using OpenUtau.Core.Render; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using Serilog; + +namespace OpenUtau.Core.Hts { + public abstract class HTSLabelRenderer : IRenderer { + + static readonly object lockObj = new object(); + + public virtual bool SupportsRenderPitch => true; + + public abstract USingerType SingerType { get; } + + public abstract bool SupportsExpression(UExpressionDescriptor descriptor); + + protected TimeAxis timeAxis; + + //information used by HTS writer + protected Dictionary phoneDict = new Dictionary(); + protected List vowels = new List(); + protected List consonants = new List(); + protected List breaks = new List(); + protected List pauses = new List(); + protected List silences = new List(); + protected List unvoiced = new List(); + protected List macronLyrics = new List(); + protected int startTick; + protected int endTick; + protected UTimeSignature sigStart; + protected double bpmStart; + protected double headMs; + protected int barLenTicksStart; + protected UTimeSignature sigEnd; + protected double bpmEnd; + protected double tailMs; + protected int barLenTicksEnd; + protected
string lang = ""; + protected int key = 0; + protected int resolution = 480; + protected int framePeriod = 5; + + //information used by openutau phonemizer + protected IG2p g2p; + //result caching + private Dictionary>> partResult = new Dictionary>>(); + protected string tablePath = string.Empty; + protected string monoScorePath = string.Empty; + protected string fullScorePath = string.Empty; + protected string monoTimingPath = string.Empty; + protected string fullTimingPath = string.Empty; + + public abstract void SetUp(); + + protected virtual void LoadDict(string path, Encoding encoding) { + if (path.EndsWith(".conf")) { + LoadConf(path, encoding); + } else { + LoadTable(path, encoding); + } + } + + private void LoadTable(string path, Encoding encoding) { + var lines = File.ReadLines(path, encoding); + foreach (var line in lines) { + var lineSplit = line.Split(); + phoneDict[lineSplit[0]] = lineSplit[1..]; + } + } + + private void LoadConf(string path, Encoding encoding) { + phoneDict["SILENCES"] = new string[] { "sil" }; + phoneDict["PAUSES"] = new string[] { "pau" }; + phoneDict["BREAK"] = new string[] { "br" }; + var lines = File.ReadLines(path, encoding); + foreach (var line in lines) { + if (line.Contains('=')) { + var lineSplit = line.Split("="); + var key = lineSplit[0]; + var value = lineSplit[1]; + var phonemes = value.Trim(new char[] { '\"' }).Split(","); + phoneDict[key] = phonemes; + } + } + } + protected IG2p LoadG2p() { + var g2ps = new List(); + var builder = G2pDictionary.NewBuilder(); + vowels.AddRange(phoneDict["VOWELS"]); + breaks.AddRange(phoneDict["BREAK"]); + pauses.AddRange(phoneDict["PAUSES"]); + silences.AddRange(phoneDict["SILENCES"]); + consonants.AddRange(phoneDict["PHONEME_CL"]); + macronLyrics.AddRange(phoneDict["MACRON"]); + foreach (var dict in phoneDict.Values) { + foreach (var phoneme in dict) { + if (!consonants.Contains(phoneme) && !vowels.Contains(phoneme) && + !breaks.Contains(phoneme) && !pauses.Contains(phoneme) && + 
!silences.Contains(phoneme)) { + consonants.Add(phoneme); + } + if (!consonants.Contains(phoneme)) { + builder.AddSymbol(phoneme, true); + } else { + builder.AddSymbol(phoneme, false); + } + } + } + foreach (var entry in phoneDict.Keys) { + builder.AddEntry(entry, phoneDict[entry]); + foreach (var reduction in phoneDict["VOWEL_REDUCTION"]) { + var phonemes = phoneDict[entry].Except(vowels).ToList(); + if (phonemes.Count == 0) continue; + builder.AddEntry(entry + reduction, phonemes); + } + foreach (var macron in phoneDict["MACRON"]) { + var addPhonemes = phoneDict[entry].Where(x => vowels.Contains(x)).ToList(); + if (addPhonemes.Count == 0) continue; + var phonemes = phoneDict[entry].ToList(); + phonemes.AddRange(addPhonemes); + builder.AddEntry(entry + macron, phonemes); + macronLyrics.Add(entry + macron); + } + } + g2ps.Add(builder.Build()); + return new G2pFallbacks(g2ps.ToArray()); + } + + + + protected (string prefix, string suffix) GetPrefixAndSuffix(RenderNote note) { + string prefix = string.Empty; + string suffix = string.Empty; + + var textList = note.lyric.Split().ToList(); + bool splitFlag = true; + foreach (var text in textList) { + var existSymbol = g2p.IsValidSymbol(text); + if (existSymbol) { + splitFlag = false; + continue; + } else if (!existSymbol && !splitFlag) { + splitFlag = true; + continue; + } + if (splitFlag) { + prefix += text; + } else { + suffix += text; + } + } + + return (prefix, suffix); + } + + private RenderPhone FindLastVowelOrLastPhoneme(RenderPhone[] phonemes) { + for (int i = phonemes.Length - 1; i >= 0; --i) { + if (g2p.IsVowel(phonemes[i].phoneme)) { + return phonemes[i]; + } + } + return phonemes[^1]; + } + + protected virtual HTSNote CustomHTSNoteContext(HTSNote htsNote, RenderNote note) { + var fixs = GetPrefixAndSuffix(note); + if (!htsNote.isRest && !htsNote.isSlur) { + htsNote.langDependent = "0"; // no macron + if (macronLyrics.Contains(note.lyric)) { + htsNote.langDependent = "1"; // macron + } + } + return htsNote; + 
} + + //make a HTS Note from given symbols and UNotes + private HTSNote makeHtsNote(string[] symbols, RenderNote note, int startTick, double leadingMs) { + var positiontick = startTick + note.position; + var endTick = positiontick + note.duration; + UTimeSignature sig = timeAxis.TimeSignatureAtTick(positiontick); + timeAxis.TickPosToBarBeat(positiontick, out int bar, out int beat, out int remainingTicks); + var isRest = symbols.Select(x => x.ToLowerInvariant()).Any(x => pauses.Contains(x) || silences.Contains(x) || breaks.Contains(x)); + var htsNote = new HTSNote( + symbols: symbols, + tone: note.tone, + isSlur: IsSyllableVowelExtensionNote(note), + isRest: isRest, + lang: isRest ? string.Empty : lang, + accent: string.Empty, + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: key, + bpm: timeAxis.GetBpmAtTick(positiontick), + startms: timeAxis.MsBetweenTickPos(startTick, positiontick) + leadingMs, + endms: timeAxis.MsBetweenTickPos(startTick, endTick) + leadingMs, + positionTicks: positiontick, + durationTicks: note.duration + ); + return CustomHTSNoteContext(htsNote, note) ?? 
htsNote; + } + private HTSNote makeHtsNote(string symbol, RenderNote note, int startTick, double leadingMs) { + return makeHtsNote(new string[] { symbol }, note, startTick, leadingMs); + } + + protected virtual bool IsSyllableVowelExtensionNote(RenderNote note) { + return note.lyric.StartsWith("+~") || note.lyric.StartsWith("+*"); + } + + private string GetPhonemeType(string phoneme) { + if (phoneme == "xx") { + return "xx"; + } + if (vowels.Contains(phoneme)) { + return "v"; + } + if (pauses.Contains(phoneme)) { + return "p"; + } + if (silences.Contains(phoneme)) { + return "s"; + } + if (breaks.Contains(phoneme)) { + return "b"; + } + //if (unvoiced.Contains(phoneme)) { + // return "c"; + //} + return "c"; + } + + private HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) { + var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray(); + foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); + htsPhonemes[i].position = i + 1; + htsPhonemes[i].position_backward = htsPhonemes.Length - i; + if (htsPhonemes[i].type.Equals("c")) { + int prev = i - 1; + if (prev >= 0) { + if (htsPhonemes[prev].type.Equals("v")) { + htsPhonemes[i].prev_vowel_distance = 1; + } else if (htsPhonemes[prev].prev_vowel_distance > 0) { + htsPhonemes[i].prev_vowel_distance = htsPhonemes[prev].prev_vowel_distance + 1; + } else { + htsPhonemes[i].prev_vowel_distance = 0; + } + } + } + } + for (int i = htsPhonemes.Length - 1; i >= 0; --i) { + if (htsPhonemes[i].type.Equals("c")) { + int next = i + 1; + if (next < htsPhonemes.Length) { + if (htsPhonemes[next].type.Equals("v")) { + htsPhonemes[i].next_vowel_distance = 1; + } else if (htsPhonemes[next].next_vowel_distance > 0) { + htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; + } else { + htsPhonemes[i].next_vowel_distance = 0; + } + } + } + } + return htsPhonemes; + } + + protected abstract HTSPhoneme[] 
CustomHTSPhonemeContext(HTSPhoneme[] htsPhonemes, RenderNote notes); + + private struct monoLabel { + public string symbol; + public double startMs; + public double endMs; + public override string ToString() { + return $"{(long)Math.Round(startMs * 10000.0)} {(long)Math.Round(endMs * 10000.0)} {symbol}"; + } + } + + public void ProcessPart(RenderPhrase phrase) { + if (timeAxis == null) { + timeAxis = phrase.timeAxis; + } + + int startTick = phrase.position; + int endTick = phrase.position + phrase.duration; + + // パディングを小節長で設定(開始・終了ともに1小節) + var sigStart = timeAxis.TimeSignatureAtTick(startTick); + var bpmStart = timeAxis.GetBpmAtTick(startTick); + var barLenMsStart = (int)Math.Round(60000.0 / bpmStart * sigStart.beatPerBar); + var barLenTicksStart = timeAxis.MsPosToTickPos(barLenMsStart); + + var sigEnd = timeAxis.TimeSignatureAtTick(endTick); + var bpmEnd = timeAxis.GetBpmAtTick(endTick); + var barLenMsEnd = (int)Math.Round(60000.0 / bpmEnd * sigEnd.beatPerBar); + var barLenTicksEnd = timeAxis.MsPosToTickPos(barLenMsEnd); + + // 文全体の長さ(開始1小節 + 本体 + 終了1小節) + double sentenceDurMs = headMs + phrase.endMs - phrase.positionMs + tailMs; + int sentenceDurTicks = barLenTicksStart + (endTick - startTick) + barLenTicksEnd; + + // 先頭パディング pau + timeAxis.TickPosToBarBeat(startTick - barLenTicksStart, out int barStart, out int beatStart, out int _); + var sigForPadStart = timeAxis.TimeSignatureAtTick(startTick - barLenTicksStart); + + + List monoLabels_ = new List(); + //double phonemeDuration = 0; + + HTSNote PaddingNoteStart = new HTSNote( + symbols: new string[] { "pau" }, + beatPerBar: sigForPadStart.beatPerBar, + beatUnit: sigForPadStart.beatUnit, + positionBar: barStart, + positionBeat: beatStart, + key: key, + bpm: timeAxis.GetBpmAtTick(startTick - barLenTicksStart), + tone: 0, + isSlur: false, + isRest: true, + lang: string.Empty, + accent: string.Empty, + startms: 0, + endms: headMs, + positionTicks: startTick - barLenTicksStart, + durationTicks: barLenTicksStart + 
); + var htsNotes = new List { PaddingNoteStart }; + var htsPhonemes = new List(); + htsPhonemes.AddRange(HTSNoteToPhonemes(PaddingNoteStart)); + + monoLabels_.Add(new monoLabel() { + symbol = htsPhonemes[0].symbol, + startMs = 0, + endMs = headMs + }); + //phonemeDuration += headMs; + + //Alignment + var phonemesByNoteIndex = phrase.phones + .GroupBy(phone => phone.noteIndex) + .ToDictionary( + group => group.Key, + group => group.Select(phone => phone).ToArray()); + var lastBasePhonemes = Array.Empty(); + var tuples = new List>(); + for (int noteIndex = 0; noteIndex < phrase.notes.Length; noteIndex++) { + var note = phrase.notes[noteIndex]; + if (phonemesByNoteIndex.TryGetValue(noteIndex, out var phonemes)) { + foreach (var phone in phonemes) { + var phoneStartMs = headMs + (phone.positionMs - phrase.positionMs); + var phoneEndMs = headMs + (phone.endMs - phrase.positionMs); + var lastMonoLabel = monoLabels_[^1]; + if (phoneStartMs < lastMonoLabel.endMs) { + lastMonoLabel.endMs = phoneStartMs; + monoLabels_[^1] = lastMonoLabel; + } + monoLabels_.Add(new monoLabel() { + symbol = phone.phoneme, + startMs = phoneStartMs, + endMs = phoneEndMs + }); + } + + lastBasePhonemes = phonemes; + HTSNote htsNote = makeHtsNote(phonemes.Select(phone => phone.phoneme).ToArray(), note, startTick, headMs); + tuples.Add(Tuple.Create(htsNote, noteIndex)); + } else if (IsSyllableVowelExtensionNote(note)) { + // 拍点延長ノートは、直前の通常ノートの最後の母音を引き延ばす + var extensionPhoneme = FindLastVowelOrLastPhoneme(lastBasePhonemes); + if (!string.IsNullOrEmpty(extensionPhoneme.phoneme)) { + var htsNote = makeHtsNote(extensionPhoneme.phoneme, note, startTick, headMs); + var extensionStartMs = htsNote.startMs; + var extensionEndMs = htsNote.endMs; + + var lastMonoLabel = monoLabels_[^1]; + if (lastMonoLabel.symbol == extensionPhoneme.phoneme && + lastMonoLabel.startMs < extensionStartMs && + extensionStartMs < lastMonoLabel.endMs) { + lastMonoLabel.endMs = extensionStartMs; + monoLabels_[^1] = lastMonoLabel; 
+ } + + monoLabels_.Add(new monoLabel() { + symbol = extensionPhoneme.phoneme, + startMs = extensionStartMs, + endMs = extensionEndMs + }); + + tuples.Add(Tuple.Create(htsNote, noteIndex)); + } + } else { + continue; + } + } + int noteCount = tuples.Count; + for (int i = 0; i < tuples.Count; i++) { + var htsNote = tuples[i].Item1; + htsNotes.Add(htsNote); + htsNote.index = i + 1; + htsNote.indexBackwards = noteCount - i; + htsNote.sentenceDurMs = sentenceDurMs; + htsNote.sentenceDurTicks = sentenceDurTicks; + var tmpPhonemes = HTSNoteToPhonemes(htsNote); + var notePhonemes = CustomHTSPhonemeContext(tmpPhonemes, phrase.notes[tuples[i].Item2]) ?? tmpPhonemes; + htsPhonemes.AddRange(notePhonemes); + } + // 終端パディング pau(位置は「本当の曲末」tick) + timeAxis.TickPosToBarBeat(endTick, out int barEnd, out int beatEnd, out int _); + HTSNote PaddingNoteEnd = new HTSNote( + symbols: new string[] { "pau" }, + beatPerBar: sigEnd.beatPerBar, + beatUnit: sigEnd.beatUnit, + positionBar: barEnd, + positionBeat: beatEnd, + key: key, + bpm: bpmEnd, + tone: 0, + isSlur: false, + isRest: true, + lang: string.Empty, + accent: string.Empty, + // 絶対msで末尾に配置 + startms: sentenceDurMs - tailMs, + endms: sentenceDurMs, + positionTicks: endTick, + durationTicks: barLenTicksEnd + ); + htsNotes.Add(PaddingNoteEnd); + htsPhonemes.AddRange(HTSNoteToPhonemes(PaddingNoteEnd)); + + monoLabels_.Add(new monoLabel() { + symbol = htsPhonemes[^1].symbol, + startMs = sentenceDurMs - tailMs, + endMs = sentenceDurMs + }); + + var htsPhrase = new HTSPhrase(htsNotes.ToArray()); + htsPhrase.UpdateResolution(resolution); + htsPhrase.totalNotes = htsNotes.Count - 1; + htsPhrase.totalPhonemes = htsPhonemes.Count - 1; + htsPhrase.totalPhrases = 1; + //make neighborhood links between htsNotes and between htsPhonemes + foreach (int i in Enumerable.Range(0, htsNotes.Count)) { + htsNotes[i].parent = htsPhrase; + if (i > 0) { + htsNotes[i].prev = htsNotes[i - 1]; + htsNotes[i - 1].next = htsNotes[i]; + } + } + for (int i = 1; i < 
/// <summary>
/// Computes the layout of a phrase: caches the tick range and the tempo /
/// time-signature at both ends, derives the leading and trailing padding,
/// and reports the resulting position and estimated length.
/// </summary>
/// <param name="phrase">Phrase to lay out; also supplies the time axis the first time this is called.</param>
/// <returns>Layout whose leading time is the head padding and whose length includes both paddings.</returns>
public virtual RenderResult Layout(RenderPhrase phrase) {
    // The time axis is taken from the first phrase seen and reused afterwards.
    timeAxis ??= phrase.timeAxis;

    startTick = phrase.position;
    endTick = phrase.position + phrase.duration;

    // Padding is intended to be one bar long at both the start and the end.
    // NOTE(review): the bar length below is beatPerBar beats of 60000/bpm ms and
    // does not take beatUnit into account - confirm this is intended for
    // time signatures that are not x/4.
    sigStart = timeAxis.TimeSignatureAtTick(startTick);
    bpmStart = timeAxis.GetBpmAtTick(startTick);
    sigEnd = timeAxis.TimeSignatureAtTick(endTick);
    bpmEnd = timeAxis.GetBpmAtTick(endTick);

    headMs = (int)Math.Round((60000.0 / bpmStart) * sigStart.beatPerBar);
    tailMs = (int)Math.Round((60000.0 / bpmEnd) * sigEnd.beatPerBar);

    return new RenderResult {
        leadingMs = headMs,
        positionMs = phrase.positionMs,
        estimatedLengthMs = headMs + phrase.durationMs + tailMs
    };
}
part, UNote note, UPhoneme phoneme, int phrasePosition) { + internal RenderPhone(UProject project, UTrack track, UVoicePart part, UNote note, UPhoneme phoneme, int phrasePosition, int noteIndex) { position = part.position + phoneme.position - phrasePosition; duration = phoneme.Duration; end = position + duration; @@ -90,6 +90,7 @@ internal RenderPhone(UProject project, UTrack track, UVoicePart part, UNote note this.phoneme = phoneme.phoneme; tone = note.tone; + this.noteIndex = noteIndex; tempos = project.timeAxis.TemposBetweenTicks(part.position + phoneme.position - leading, part.position + phoneme.End); UTempo[] noteTempos = project.timeAxis.TemposBetweenTicks(part.position + phoneme.position, part.position + phoneme.End); tempo = noteTempos.Length > 0 ? noteTempos[0].bpm : project.tempos[0].bpm; @@ -211,12 +212,10 @@ internal RenderPhrase(UProject project, UTrack track, UVoicePart part, IEnumerab uNotes.Add(next); next = next.Next; } - if (uNotes.First().Prev != null && uNotes.First().Prev.End == uNotes.First().position) { - uNotes.Insert(0, uNotes.First().Prev); - } - if (uNotes.Last().Next != null && uNotes.Last().End == uNotes.Last().Next.position) { - uNotes.Add(uNotes.Last().Next); - } + + var noteIndexes = uNotes + .Select((note, index) => new { note, index }) + .ToDictionary(x => x.note, x => x.index); singer = track.Singer; renderer = track.RendererSettings.Renderer; @@ -231,7 +230,7 @@ internal RenderPhrase(UProject project, UTrack track, UVoicePart part, IEnumerab .Select(n => new RenderNote(project, part, n, position)) .ToArray(); phones = phonemes - .Select(p => new RenderPhone(project, track, part, p.Parent, p, position)) + .Select(p => new RenderPhone(project, track, part, p.Parent, p, position, noteIndexes[p.Parent])) .ToArray(); leading = phones.First().leading; diff --git a/OpenUtau.Core/Util/HTS.cs b/OpenUtau.Core/Util/HTS.cs new file mode 100644 index 000000000..cb55436fc --- /dev/null +++ b/OpenUtau.Core/Util/HTS.cs @@ -0,0 +1,769 @@ +using 
/// <summary>
/// Conversions between MIDI note numbers and the note names used in HTS labels.
/// HTS labels spell accidentals with flats ("b"); "#" is accepted on input only.
/// All numbers are formatted and parsed with the invariant culture so that the
/// label text does not depend on the machine's regional settings.
/// </summary>
public static class HTS {
    /// <summary>Note names of one octave, indexed by (note number % 12).</summary>
    public static readonly string[] KeysInOctave = {
        "C",
        "Db",
        "D",
        "Eb",
        "E",
        "F",
        "Gb",
        "G",
        "Ab",
        "A",
        "Bb",
        "B",
    };

    /// <summary>Semitone offset within an octave for each accepted note spelling (sharp or flat).</summary>
    public static readonly Dictionary<string, int> NameInOctave = new Dictionary<string, int> {
        { "C", 0 }, { "C#", 1 }, { "Db", 1 },
        { "D", 2 }, { "D#", 3 }, { "Eb", 3 },
        { "E", 4 },
        { "F", 5 }, { "F#", 6 }, { "Gb", 6 },
        { "G", 7 }, { "G#", 8 }, { "Ab", 8 },
        { "A", 9 }, { "A#", 10 }, { "Bb", 10 },
        { "B", 11 },
    };

    /// <summary>
    /// Returns the note name followed by the octave number (60 gives "C4"),
    /// or an empty string for a negative note number.
    /// </summary>
    public static string GetToneName(int noteNum) {
        return noteNum < 0 ? string.Empty : KeysInOctave[noteNum % 12] + GetOctaveNum(noteNum);
    }

    /// <summary>
    /// Returns the octave number of a note as text (60 gives "4", 0 gives "-1"),
    /// or an empty string for a negative note number.
    /// </summary>
    public static string GetOctaveNum(int noteNum) {
        if (noteNum < 0) {
            return string.Empty;
        }
        // Invariant culture: the octave of notes 0..11 is negative and some
        // cultures format the minus sign with a non-ASCII character.
        return (noteNum / 12 - 1).ToString(System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Parses a note name such as "C4", "Db4" or "A#3" into a note number.
    /// </summary>
    /// <returns>The note number, or -1 if <paramref name="name"/> is null or cannot be parsed.</returns>
    public static int NameToTone(string name) {
        if (name == null || name.Length < 2) {
            return -1;
        }
        // The note part is two characters when an accidental follows the letter.
        var str = name.Substring(0, (name[1] == '#' || name[1] == 'b') ? 2 : 1);
        var num = name.Substring(str.Length);
        if (!int.TryParse(
                num,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out int octave)) {
            return -1;
        }
        if (!NameInOctave.TryGetValue(str, out int inOctave)) {
            return -1;
        }
        return 12 * (octave + 1) + inOctave;
    }

    /// <summary>
    /// Writes a signed integer in HTS style: "p" + magnitude for values &gt;= 0,
    /// "m" + magnitude for negative values.
    /// </summary>
    public static string WriteInt(int integer) {
        // Widen before taking the magnitude: Math.Abs(int.MinValue) throws OverflowException.
        long magnitude = Math.Abs((long)integer);
        return (integer >= 0 ? "p" : "m")
            + magnitude.ToString(System.Globalization.CultureInfo.InvariantCulture);
    }
}
/// <summary>
/// Stateless helpers for building HTS notes and for turning predicted phoneme
/// durations into positions that are aligned to known anchor points.
/// </summary>
public static class HTSContextBuilder {
    /// <summary>
    /// True when <paramref name="isPauseLike"/> accepts the lower-cased form of at least one symbol.
    /// </summary>
    public static bool HasPauseLikePhoneme(IEnumerable<string> symbols, Func<string, bool> isPauseLike) {
        foreach (var symbol in symbols) {
            if (isPauseLike(symbol.ToLowerInvariant())) {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Creates an <see cref="HTSNote"/> for the tick range [noteStartTick, noteEndTick).
    /// Time signature, bar/beat position and bpm are read from <paramref name="timeAxis"/>
    /// at the note start; start/end milliseconds are measured from
    /// <paramref name="phraseStartTick"/> and shifted by <paramref name="startMsOffset"/>.
    /// A note containing any pause-like symbol is marked as a rest and gets an empty language.
    /// </summary>
    public static HTSNote BuildNote(
        string[] symbols,
        int tone,
        bool isSlur,
        string lang,
        int key,
        TimeAxis timeAxis,
        int noteStartTick,
        int noteEndTick,
        int phraseStartTick,
        int startMsOffset,
        Func<string, bool> isPauseLike) {
        UTimeSignature signature = timeAxis.TimeSignatureAtTick(noteStartTick);
        timeAxis.TickPosToBarBeat(noteStartTick, out int barIndex, out int beatIndex, out int _);
        bool rest = HasPauseLikePhoneme(symbols, isPauseLike);
        string noteLang = rest ? string.Empty : lang;
        var tempo = timeAxis.GetBpmAtTick(noteStartTick);
        // The millisecond value is truncated to int before the offset is added.
        var noteStartMs = (int)timeAxis.MsBetweenTickPos(phraseStartTick, noteStartTick) + startMsOffset;
        var noteEndMs = (int)timeAxis.MsBetweenTickPos(phraseStartTick, noteEndTick) + startMsOffset;
        return new HTSNote(
            symbols: symbols,
            tone: tone,
            isSlur: isSlur,
            isRest: rest,
            lang: noteLang,
            accent: string.Empty,
            beatPerBar: signature.beatPerBar,
            beatUnit: signature.beatUnit,
            positionBar: barIndex,
            positionBeat: beatIndex,
            key: key,
            bpm: tempo,
            startms: noteStartMs,
            endms: noteEndMs,
            positionTicks: noteStartTick,
            durationTicks: noteEndTick - noteStartTick);
    }

    /// <summary>
    /// Index of the first symbol accepted by <paramref name="isVowel"/>;
    /// 0 when no symbol is accepted (including an empty list).
    /// </summary>
    public static int FindFirstVowelIndex(IReadOnlyList<string> symbols, Func<string, bool> isVowel) {
        int index = 0;
        foreach (var symbol in symbols) {
            if (isVowel(symbol)) {
                return index;
            }
            index++;
        }
        return 0;
    }

    /// <summary>
    /// Converts durations into start positions that hit the given alignment points.
    /// Each alignment point is (duration index, position in ms). Durations before the
    /// first point keep their length and end at that point; durations between two
    /// consecutive points are scaled uniformly so that they span exactly the gap.
    /// Returns an empty list when either input is empty.
    /// </summary>
    public static List<double> AlignTimingPositions(
        IReadOnlyList<double> durationsMs,
        IReadOnlyList<Tuple<int, double>> phAlignPoints) {
        var aligned = new List<double>();
        if (durationsMs.Count == 0 || phAlignPoints.Count == 0) {
            return aligned;
        }

        // Leading group: everything before the first alignment point, unscaled.
        var firstPoint = phAlignPoints[0];
        int leadingCount = Math.Max(0, firstPoint.Item1 - 1);
        var leading = durationsMs.Take(leadingCount).ToList();
        aligned.AddRange(Stretch(leading, 1, firstPoint.Item2));

        // Remaining groups: one per pair of neighbouring alignment points.
        for (int i = 0; i + 1 < phAlignPoints.Count; i++) {
            var from = phAlignPoints[i];
            var to = phAlignPoints[i + 1];
            int span = to.Item1 - from.Item1;
            if (span <= 0) {
                continue;
            }
            var segment = durationsMs.Skip(from.Item1).Take(span).ToList();
            if (segment.Count == 0) {
                continue;
            }
            double total = segment.Sum();
            // A zero-length segment collapses onto the end point instead of dividing by zero.
            double scale = total == 0 ? 0 : (to.Item2 - from.Item2) / total;
            aligned.AddRange(Stretch(segment, scale, to.Item2));
        }
        return aligned;
    }

    /// <summary>
    /// Pairs each non-empty phoneme in [startIndex, endIndex) with its tick offset from
    /// <paramref name="notePosMs"/>. The position of phoneme i is read from
    /// <paramref name="positionsMs"/> at i - 1; indices outside either list are skipped.
    /// </summary>
    public static List<Tuple<string, int>> BuildAlignedNoteTimingResult(
        IReadOnlyList<string> phonemes,
        int startIndex,
        int endIndex,
        IReadOnlyList<double> positionsMs,
        double notePosMs,
        Func<double, double, int> ticksBetweenMsPos) {
        var timings = new List<Tuple<string, int>>();
        for (int index = startIndex; index < endIndex; ++index) {
            bool phonemeInRange = index >= 0 && index < phonemes.Count;
            if (!phonemeInRange) {
                continue;
            }
            string symbol = phonemes[index];
            if (string.IsNullOrEmpty(symbol)) {
                continue;
            }
            int posIndex = index - 1;
            bool positionInRange = posIndex >= 0 && posIndex < positionsMs.Count;
            if (!positionInRange) {
                continue;
            }
            int ticks = ticksBetweenMsPos(notePosMs, positionsMs[posIndex]);
            timings.Add(Tuple.Create(symbol, ticks));
        }
        return timings;
    }

    /// <summary>
    /// Returns the start position of every element of <paramref name="source"/> after
    /// scaling each length by <paramref name="ratio"/> and placing the group so that
    /// it ends at <paramref name="endPos"/>.
    /// </summary>
    public static List<double> Stretch(IList<double> source, double ratio, double endPos) {
        var starts = new List<double>(source.Count);
        // Running position begins at the group start; the initial "+ 0" mirrors the
        // zero prepended to the sequence in the cumulative-sum formulation.
        double cursor = endPos - source.Sum() * ratio;
        cursor += 0;
        foreach (var length in source) {
            starts.Add(cursor);
            cursor += length * ratio;
        }
        return starts;
    }

    /// <summary>
    /// Lazily yields the running total of <paramref name="sequence"/>, beginning from
    /// <paramref name="start"/>; the first yielded value already includes the first item.
    /// </summary>
    public static IEnumerable<double> CumulativeSum(IEnumerable<double> sequence, double start = 0) {
        double total = start;
        foreach (var value in sequence) {
            total += value;
            yield return total;
        }
    }
}
/// <summary>
/// One phoneme of an HTS full-context label. Holds the phoneme's own context
/// (the "p" section) and delegates every note/phrase level section to its
/// parent <see cref="HTSNote"/>.
/// </summary>
public class HTSPhoneme {
    public string symbol;
    public string flag1 = "xx";
    public string flag2 = "xx";

    // Neighbouring phonemes in the label sequence and the note that owns this phoneme.
    public HTSPhoneme? prev;
    public HTSPhoneme? next;
    public HTSNote parent;

    // Phoneme class: v = vowel, c = consonant, p = pause, s = silence, b = break.
    public string type = "xx";
    // 1-based index of this phoneme inside its note, counted from the front.
    public int position = 1;
    // 1-based index of this phoneme inside its note, counted from the back.
    public int position_backward = 1;
    // Distance to the previous / next vowel inside the note.
    // 0 means "not set" and is written to the label as "xx".
    public int prev_vowel_distance = 0;
    public int next_vowel_distance = 0;

    public HTSPhoneme(string phoneme, HTSNote note) {
        symbol = phoneme;
        parent = note;
    }

    /// <summary>The phoneme two places back, or null when there is none.</summary>
    public HTSPhoneme? beforePrev => prev?.prev;

    /// <summary>The phoneme two places ahead, or null when there is none.</summary>
    public HTSPhoneme? afterNext => next?.next;

    /// <summary>
    /// Renders this phoneme as one HTS label line: start and end time of the parent
    /// note in 100 ns units, followed by the p, A-J context sections.
    /// </summary>
    public string dump() {
        // Times are converted through long so the 100 ns values cannot overflow int.
        string timing =
            $"{(long)Math.Round(parent.startMs * 10000.0)} {(long)Math.Round(parent.endMs * 10000.0)} ";
        string[] sections = {
            timing,
            // Phoneme context
            string.Format("{0}@{1}^{2}-{3}+{4}={5}_{6}%{7}^{8}_{9}~{10}-{11}!{12}[{13}${14}]{15}", p()),
            // Syllable context (previous / current / next)
            string.Format("/A:{0}-{1}-{2}@{3}~{4}", a()),
            string.Format("/B:{0}_{1}_{2}@{3}|{4}", b()),
            string.Format("/C:{0}+{1}+{2}@{3}&{4}", c()),
            // Note context (previous / current / next)
            string.Format("/D:{0}!{1}#{2}${3}%{4}|{5}&{6};{7}-{8}", d()),
            string.Format(
                "/E:{0}]{1}^{2}={3}~{4}!{5}@{6}#{7}+{8}]{9}${10}|{11}[{12}&{13}]{14}={15}^{16}~{17}#{18}_{19};{20}${21}&{22}%{23}[{24}|{25}]{26}-{27}^{28}+{29}~{30}={31}@{32}${33}!{34}%{35}#{36}|{37}|{38}-{39}&{40}&{41}+{42}[{43};{44}]{45};{46}~{47}~{48}^{49}^{50}@{51}[{52}#{53}={54}!{55}~{56}+{57}!{58}^{59}",
                e()),
            string.Format("/F:{0}#{1}#{2}-{3}${4}${5}+{6}%{7};{8}", f()),
            // Phrase context (previous / current / next) and song context
            string.Format("/G:{0}_{1}", g()),
            string.Format("/H:{0}_{1}", h()),
            string.Format("/I:{0}_{1}", i()),
            string.Format("/J:{0}~{1}@{2}", j()),
        };
        return string.Concat(sections);
    }

    /// <summary>
    /// The 16 phoneme-level context values: type, the five-phoneme symbol window,
    /// the five-phoneme flag1 window, positions, vowel distances and flag2.
    /// Missing neighbours and unset distances are written as "xx".
    /// </summary>
    public string[] p() {
        var twoBack = beforePrev;
        var twoAhead = afterNext;
        return new string[] {
            type,
            SymbolOrUndefined(twoBack),
            SymbolOrUndefined(prev),
            symbol,
            SymbolOrUndefined(next),
            SymbolOrUndefined(twoAhead),
            Flag1OrUndefined(twoBack),
            Flag1OrUndefined(prev),
            flag1,
            Flag1OrUndefined(next),
            Flag1OrUndefined(twoAhead),
            position.ToString(),
            position_backward.ToString(),
            DistanceOrUndefined(prev_vowel_distance),
            DistanceOrUndefined(next_vowel_distance),
            flag2,
        };
    }

    // Sections A-J are owned by the parent note; the phoneme only forwards them.
    public string[] a() => parent.a();

    public string[] b() => parent.b();

    public string[] c() => parent.c();

    public string[] d() => parent.d();

    public string[] e() => parent.e();

    public string[] f() => parent.f();

    public string[] g() => parent.g();

    public string[] h() => parent.h();

    public string[] i() => parent.i();

    public string[] j() => parent.j();

    // Symbol of a neighbouring phoneme, "xx" when the neighbour does not exist.
    private static string SymbolOrUndefined(HTSPhoneme? phoneme) {
        return phoneme == null ? "xx" : phoneme.symbol;
    }

    // flag1 of a neighbouring phoneme, "xx" when the neighbour does not exist.
    private static string Flag1OrUndefined(HTSPhoneme? phoneme) {
        return phoneme == null ? "xx" : phoneme.flag1;
    }

    // Vowel distance as text, "xx" for the unset value 0.
    private static string DistanceOrUndefined(int distance) {
        return distance == 0 ? "xx" : distance.ToString();
    }
}
/// <summary>
/// One note of an HTS score label. Stores timing, musical context and links to
/// the neighbouring notes and the owning <see cref="HTSPhrase"/>, and renders the
/// A-J context sections of the label. Values that are unknown are written as "xx".
/// </summary>
public class HTSNote {
    public double startMs = 0;
    public double endMs = 0;
    public int positionTicks;
    public int durationTicks = 0;
    public int index = 0;//index of this note in sentence
    public int indexBackwards = 0;
    public double sentenceDurMs = 0;
    public int sentenceDurTicks = 0;
    public double startMsPercent = 0;

    //TimeSignatures
    public int beatPerBar = 0;
    public int beatUnit = 0;

    public int positionBar = 1; //bar number in the sentence, starting from 1
    public int positionBeat = 1; //unit number in the bar, starting from 1

    public double key = 0;
    public double bpm = 0;
    public int tone = 0;
    public bool isSlur = false;
    public bool isRest = true;
    public string[] symbols;
    public string lang = string.Empty;
    public string langDependent = "xx";
    public string accent = string.Empty;

    // Neighbouring notes in the phrase and the phrase that owns this note.
    public HTSNote? prev;
    public HTSNote? next;
    public HTSPhrase parent;

    /// <summary>
    /// Creates a note from its symbols, musical context and timing.
    /// Index, sentence duration, neighbour links and parent are not set here;
    /// they are public fields assigned by the caller afterwards.
    /// </summary>
    public HTSNote(string[] symbols, int beatPerBar, int beatUnit, int positionBar, int positionBeat, int key, double bpm, int tone, bool isSlur, bool isRest, string lang, string accent, double startms, double endms, int positionTicks, int durationTicks) {
        this.startMs = startms;
        this.endMs = endms;
        this.beatPerBar = beatPerBar;
        this.beatUnit = beatUnit;
        this.positionBar = positionBar;
        this.positionBeat = positionBeat;
        this.key = key;
        this.bpm = bpm;
        this.tone = tone;
        this.isSlur = isSlur;
        this.isRest = isRest;
        this.lang = lang;
        this.accent = accent;
        this.symbols = symbols;
        this.positionTicks = positionTicks;
        this.durationTicks = durationTicks;
    }

    /// <summary>Length of the note in milliseconds (endMs - startMs).</summary>
    public double durationMs {
        get { return endMs - startMs; }
    }

    // Time from the start of this note to the end of the sentence, in milliseconds.
    private double startMsBackwards {
        get { return sentenceDurMs - startMs; }
    }

    // Ticks from the position of this note to the end of the sentence.
    // Not referenced anywhere inside this class.
    private int positionTickBackwards {
        get { return sentenceDurTicks - positionTicks; }
    }


    // Position of the note inside its measure (forward = from the measure start,
    // backward = from the measure end). Null until filled in; e() writes "xx" for null.
    public int? measureIndexForward;
    public double? measureMsForward;
    public int? measureTickForward;
    public int? measurePercentForward;
    public int? measureIndexBackward;
    public double? measureMsBackward;
    public int? measureTickBackward;
    public int? measurePercentBackward;

    // Distance of the note from the neighbouring accented note.
    // Null until filled in; e() writes "xx" for null.
    public int? accentIndexForward;
    public double? accentMsForward;
    public int? accentTickForward;
    public int? accentIndexBackward;
    public double? accentMsBackward;
    public int? accentTickBackward;

    /// <summary>
    /// A section: the B values of the previous note, or five "xx" when there is
    /// no previous note or the previous note is a rest.
    /// </summary>
    public string[] a() {
        if (prev == null) {
            return Enumerable.Repeat("xx", 5).ToArray();
        } else if (prev.isRest) {
            return Enumerable.Repeat("xx", 5).ToArray();
        } else {
            return prev.b();
        }
    }

    /// <summary>
    /// B section: number of symbols in this note, two constant "1" slots,
    /// the language ("xx" when empty) and the language-dependent value.
    /// </summary>
    public string[] b() {
        return new string[] {
            symbols.Length.ToString(),
            "1",
            "1",
            lang != string.Empty ? lang : "xx",
            langDependent,
        };
    }

    /// <summary>
    /// C section: the B values of the next note, or five "xx" when there is
    /// no next note or the next note is a rest.
    /// </summary>
    public string[] c() {
        if (next == null) {
            return Enumerable.Repeat("xx", 5).ToArray();
        } else if (next.isRest) {
            return Enumerable.Repeat("xx", 5).ToArray();
        } else {
            return next.b();
        }
    }

    /// <summary>
    /// D section: the E values of the previous note, or 60 "xx" when there is
    /// no previous note or the previous note is a rest.
    /// </summary>
    public string[] d() {
        if (prev == null) {
            return Enumerable.Repeat("xx", 60).ToArray();
        } else if (prev.isRest) {
            return Enumerable.Repeat("xx", 60).ToArray();
        } else {
            return prev.e();
        }
    }

    /// <summary>
    /// E section: the 60 context values of this note. Array index i holds label
    /// field e(i+1). Every slot that is not assigned below stays "xx".
    /// </summary>
    public string[] e() {
        var result = Enumerable.Repeat("xx", 60).ToArray();
        // e1-e6: pitch name, octave, key, time signature, tempo and a constant "1".
        result[0] = isRest ? "xx" : HTS.GetToneName(tone);
        result[1] = isRest ? "xx" : HTS.GetOctaveNum(tone);
        result[2] = ((int)Math.Round(key)).ToString();
        result[3] = $"{beatPerBar}/{beatUnit}";
        result[4] = ((int)Math.Round(bpm)).ToString();
        result[5] = "1";

        // e7/e8: note length in 10 ms units and in 96th notes.
        // A 96th note is resolution / 24 ticks; 0 disables every 96th-note based field.
        int lengthCs = Math.Max(0, (int)Math.Round(durationMs / 10.0));
        int ticksPer96th = (parent != null && parent.resolution > 0) ? parent.resolution / 24 : 0;
        int length96 = (ticksPer96th > 0) ? (int)Math.Round((double)durationTicks / ticksPer96th) : 0;
        result[6] = lengthCs.ToString();
        result[7] = length96.ToString();

        // result[8] (e9) is never assigned and stays "xx".
        // e10-e17: position inside the measure. The values are written as stored,
        // only rounded; any unit scaling has to happen where the fields are filled in.
        result[9] = measureIndexForward != null ? measureIndexForward.ToString() : "xx"; // e10
        result[10] = measureIndexBackward != null ? measureIndexBackward.ToString() : "xx"; // e11
        // NOTE(review): HTSPhrase fills measureMs* with milliseconds divided by 100.0,
        // i.e. 100 ms units rather than centiseconds - confirm which unit e12/e13 expect.
        result[11] = measureMsForward != null ? ((int)Math.Round(measureMsForward.Value)).ToString() : "xx"; // e12 (already scaled by the producer)
        result[12] = measureMsBackward != null ? ((int)Math.Round(measureMsBackward.Value)).ToString() : "xx"; // e13
        result[13] = measureTickForward != null ? measureTickForward.ToString() : "xx"; // e14 (96th already)
        result[14] = measureTickBackward != null ? measureTickBackward.ToString() : "xx"; // e15
        result[15] = measurePercentForward != null ? measurePercentForward.ToString() : "xx"; // e16
        result[16] = measurePercentBackward != null ? measurePercentBackward.ToString() : "xx"; // e17

        // e18-e25: position inside the phrase; left as "xx" for rests.
        if (!isRest) {
            result[17] = index <= 0 ? "xx" : index.ToString();
            result[18] = indexBackwards <= 0 ? "xx" : indexBackwards.ToString();
            result[19] = ((int)Math.Round(startMs / 10)).ToString(); // in 10 ms units
            result[20] = ((int)Math.Round(startMsBackwards / 10)).ToString();

            // e22/e23: phrase-level position by 96th note, resolution independent
            if (ticksPer96th > 0 && parent != null && parent.notes != null && index > 0) {
                // Measured from the earliest / latest note position in the phrase;
                // this note's own position is the fallback for an empty note list.
                int firstPhraseTick = parent.notes
                    .Select(note => note.positionTicks)
                    .DefaultIfEmpty(positionTicks)
                    .Min();
                int lastPhraseTick = parent.notes
                    .Select(note => note.positionTicks)
                    .DefaultIfEmpty(positionTicks)
                    .Max();
                int forwardTicks = Math.Max(0, positionTicks - firstPhraseTick);
                int backwardTicks = Math.Max(0, lastPhraseTick - positionTicks);
                // Integer division rounded to nearest by adding half a unit first.
                result[21] = ((forwardTicks + ticksPer96th / 2) / ticksPer96th).ToString();
                result[22] = ((backwardTicks + ticksPer96th / 2) / ticksPer96th).ToString();
            } else {
                result[21] = "xx";
                result[22] = "xx";
            }

            // e24/e25: position in the phrase as a percentage of (totalNotes - 1).
            int totalNotes = parent?.totalNotes ?? 0;
            if (totalNotes > 1) {
                result[23] = ((index - 1) * 100 / (totalNotes - 1)).ToString();
                result[24] = ((indexBackwards - 1) * 100 / (totalNotes - 1)).ToString();
            } else {
                result[23] = "xx";
                result[24] = "xx";
            }

        }

        // e26/e27: "1" only when both this note and the neighbour are slurs.
        if (prev != null) {
            result[25] = prev.isSlur && isSlur ? "1" : "0";
        } else {
            result[25] = "0";
        }
        if (next != null) {
            result[26] = next.isSlur && isSlur ? "1" : "0";
        } else {
            result[26] = "0";
        }
        result[27] = "n";
        // e29-e34: accent distances by note count, by 10 ms and by 96th note.
        // Note that the backward value is written before the forward value in each pair.
        result[28] = accentIndexBackward.HasValue ? accentIndexBackward.Value.ToString() : "xx";
        result[29] = accentIndexForward.HasValue ? accentIndexForward.Value.ToString() : "xx";
        result[30] = accentMsBackward.HasValue ? ((int)Math.Round(accentMsBackward.Value / 10.0)).ToString() : "xx";
        result[31] = accentMsForward.HasValue ? ((int)Math.Round(accentMsForward.Value / 10.0)).ToString() : "xx";
        result[32] = (accentTickBackward.HasValue && ticksPer96th > 0) ? ((int)Math.Round((double)accentTickBackward.Value / ticksPer96th)).ToString() : "xx";
        result[33] = (accentTickForward.HasValue && ticksPer96th > 0) ? ((int)Math.Round((double)accentTickForward.Value / ticksPer96th)).ToString() : "xx";

        // TODO: e35-e56 (result[34]..result[55]) remain intentionally "xx" until
        // OpenUtau adopts a verified mapping for staccato / crescendo /
        // decrescendo related score-label contexts. Keep current behavior
        // visible instead of guessing values from timing-label-only information.

        // e57/e58: pitch difference to the previous / next note, written with
        // HTS.WriteInt; "xx" when either side is a rest or has no positive tone.
        // result[58] and result[59] are never assigned and stay "xx".
        if (!isRest && this.tone > 0) {
            result[56] = (prev == null || prev.isRest || prev.tone <= 0) ? "xx" : HTS.WriteInt(prev.tone - tone);
            result[57] = (next == null || next.isRest || next.tone <= 0) ? "xx" : HTS.WriteInt(next.tone - tone);
        } else {
            result[56] = "xx";
            result[57] = "xx";
        }
        return result;
    }

    /// <summary>
    /// F section: the E values of the next note, or 60 "xx" when there is
    /// no next note or the next note is a rest.
    /// </summary>
    public string[] f() {
        if (next == null) {
            return Enumerable.Repeat("xx", 60).ToArray();
        } else if (next.isRest) {
            return Enumerable.Repeat("xx", 60).ToArray();
        } else {
            return next.e();
        }
    }

    /// <summary>
    /// G section: for a rest that has a previous note, that note's H values;
    /// otherwise the G values of the parent phrase.
    /// </summary>
    public string[] g() {
        //TODO Calculate using HTSPhrase
        if (prev != null) {
            if (isRest) {
                return prev.h();
            }
        }
        return parent.g();
    }

    /// <summary>
    /// H section: two "xx" for a rest, otherwise the H values of the parent phrase.
    /// </summary>
    public string[] h() {
        // TODO Calculate using HTSPhrase
        if (isRest) {
            return Enumerable.Repeat("xx", 2).ToArray();
        }
        return parent.h();
    }

    /// <summary>
    /// I section: for a rest that has a next note, that note's H values;
    /// otherwise the I values of the parent phrase.
    /// </summary>
    public string[] i() {
        //TODO Calculate using HTSPhrase
        if (next != null) {
            if (isRest) {
                return next.h();
            }
        }
        return parent.i();
    }

    /// <summary>J section: forwarded from the parent phrase.</summary>
    public string[] j() {
        return parent.j();
    }
}
/// <summary>
/// A phrase of an HTS score label: an ordered list of notes plus the totals used
/// by the song-level context. Constructing the phrase, or changing its
/// resolution, recomputes the accent and in-measure position fields of every note.
/// </summary>
public class HTSPhrase {
    // Ticks per quarter note; a 96th note is resolution / 24 ticks.
    public int resolution = 480;
    public int totalPhrases;
    public int totalNotes;
    public int totalPhonemes;

    public HTSPhrase? prev;
    public HTSPhrase? next;
    public HTSNote[] notes;

    /// <summary>Wraps the given notes and fills in their derived context fields.</summary>
    public HTSPhrase(HTSNote[] notes) {
        this.notes = notes;
        RecalculateDerivedContexts();
    }

    /// <summary>
    /// Sets the tick resolution and recomputes the derived context fields,
    /// because the 96th-note based values depend on it.
    /// </summary>
    public void UpdateResolution(int resolution) {
        this.resolution = resolution;
        RecalculateDerivedContexts();
    }

    // Recomputes, for every note, the distance to the neighbouring accented note
    // and the position inside its measure. All derived fields are cleared first,
    // so a field that is not reached below stays null.
    void RecalculateDerivedContexts() {
        foreach (var note in notes) {
            note.accentIndexForward = null;
            note.accentMsForward = null;
            note.accentTickForward = null;
            note.accentIndexBackward = null;
            note.accentMsBackward = null;
            note.accentTickBackward = null;
            note.measureIndexForward = null;
            note.measureMsForward = null;
            note.measureTickForward = null;
            note.measurePercentForward = null;
            note.measureIndexBackward = null;
            note.measureMsBackward = null;
            note.measureTickBackward = null;
            note.measurePercentBackward = null;
        }

        // Accent, forward pass: distance accumulated since the last accented note.
        // A rest resets the accumulators; a running value of 0 means "no accent seen yet".
        int accentIndexForwardSum = 0;
        double accentMsForwardSum = 0;
        int accentTickForwardSum = 0;
        for (int i = 0; i < notes.Length; i++) {
            var note = notes[i];
            if (note.isRest) {
                accentIndexForwardSum = 0;
                accentMsForwardSum = 0;
                accentTickForwardSum = 0;
            } else if (!string.IsNullOrEmpty(note.accent)) {
                note.accentIndexForward = 0;
                note.accentMsForward = 0;
                note.accentTickForward = 0;

                accentIndexForwardSum = 1;
                accentMsForwardSum = note.durationMs;
                accentTickForwardSum = note.durationTicks;
            } else {
                if (accentIndexForwardSum != 0) {
                    note.accentIndexForward = accentIndexForwardSum;
                    accentIndexForwardSum += 1;
                }
                if (accentMsForwardSum != 0) {
                    note.accentMsForward = accentMsForwardSum;
                    accentMsForwardSum += note.durationMs;
                }
                if (accentTickForwardSum != 0) {
                    note.accentTickForward = accentTickForwardSum;
                    accentTickForwardSum += note.durationTicks;
                }
            }
        }

        // Accent, backward pass: same accumulation walking from the last note to the first.
        // For an accented note the contribution of the accent found just before it in
        // this pass is subtracted, clamped at 0.
        int accentIndexBackwardSum = 0;
        double accentMsBackwardSum = 0;
        int accentTickBackwardSum = 0;
        int lastAccentIndexContribution = 0;
        double lastAccentMs = 0;
        int lastAccentTicks = 0;
        for (int i = notes.Length - 1; i >= 0; i--) {
            var note = notes[i];
            if (note.isRest) {
                accentIndexBackwardSum = 0;
                accentMsBackwardSum = 0;
                accentTickBackwardSum = 0;
                lastAccentIndexContribution = 0;
                lastAccentMs = 0;
                lastAccentTicks = 0;
            } else if (!string.IsNullOrEmpty(note.accent)) {
                note.accentIndexBackward = Math.Max(0, accentIndexBackwardSum - lastAccentIndexContribution);
                note.accentMsBackward = Math.Max(0, accentMsBackwardSum - lastAccentMs);
                note.accentTickBackward = Math.Max(0, accentTickBackwardSum - lastAccentTicks);

                lastAccentIndexContribution = 1;
                lastAccentMs = note.durationMs;
                lastAccentTicks = note.durationTicks;

                accentIndexBackwardSum = 1;
                accentMsBackwardSum = note.durationMs;
                accentTickBackwardSum = note.durationTicks;
            } else {
                if (accentIndexBackwardSum != 0) {
                    note.accentIndexBackward = accentIndexBackwardSum;
                    accentIndexBackwardSum += 1;
                }
                if (accentMsBackwardSum != 0) {
                    note.accentMsBackward = accentMsBackwardSum;
                    accentMsBackwardSum += note.durationMs;
                }
                if (accentTickBackwardSum != 0) {
                    note.accentTickBackward = accentTickBackwardSum;
                    accentTickBackwardSum += note.durationTicks;
                }

            }
        }

        // Group the notes by measure (positionBar), each group ordered by tick position.
        var groups = notes
            .GroupBy(n => n.positionBar)
            .OrderBy(bar => bar.Key)
            .Select(bar => bar.OrderBy(n => n.positionTicks).ToList())
            .ToList();

        int ticksPer96th = (resolution > 0) ? (resolution / 24) : 0;

        foreach (var group in groups) {
            int totalNotesInMeasure = group.Count;
            // Forward: position measured from the first note of the measure.
            // NOTE(review): milliseconds are divided by 100.0 here (100 ms units) while
            // the note-level start time is written in 10 ms units - confirm the intended unit.
            double accMsF = 0;
            int accTicksF = 0;
            for (var noteIndex = 0; noteIndex < group.Count; noteIndex++) {
                var note = group[noteIndex];
                note.measureIndexForward = noteIndex + 1;
                note.measureMsForward = (int)Math.Round(accMsF / 100.0);
                note.measureTickForward = ticksPer96th > 0 ? (int)Math.Round((double)accTicksF / ticksPer96th) : 0;
                note.measurePercentForward = totalNotesInMeasure > 1 ? (noteIndex * 100) / (totalNotesInMeasure - 1) : 0;

                accMsF += note.durationMs;
                accTicksF += note.durationTicks;
            }

            // Backward: position measured from the last note of the measure.
            double accMsB = 0;
            int accTicksB = 0;
            for (int noteIndex = group.Count - 1; noteIndex >= 0; --noteIndex) {
                var note = group[noteIndex];
                int backwardIndex = group.Count - noteIndex;
                note.measureIndexBackward = backwardIndex;
                note.measureMsBackward = (int)Math.Round(accMsB / 100.0);
                note.measureTickBackward = ticksPer96th > 0 ? (int)Math.Round((double)accTicksB / ticksPer96th) : 0;
                note.measurePercentBackward = totalNotesInMeasure > 1 ? ((backwardIndex - 1) * 100) / (totalNotesInMeasure - 1) : 0;

                accMsB += note.durationMs;
                accTicksB += note.durationTicks;
            }
        }
    }

    // Number of measures spanned by the phrase, from the bar of the first note to the
    // bar of the last note. 0 for a phrase without notes, so that j() writes "xx"
    // instead of failing on notes[^1].
    private int barCount {
        get {
            if (notes.Length == 0) {
                return 0;
            }
            return notes[^1].positionBar - notes[0].positionBar + 1;
        }
    }

    /// <summary>G section: the H values of the previous phrase, or two "xx" when there is none.</summary>
    public string[] g() {
        var result = Enumerable.Repeat("xx", 2).ToArray();
        if (prev == null) {
            return result;
        } else {
            return prev.h();
        }
    }

    /// <summary>H section: number of notes and total number of symbols in this phrase.</summary>
    public string[] h() {
        var result = Enumerable.Repeat("xx", 2).ToArray();
        result[0] = notes.Length.ToString();
        result[1] = notes.Select(note => note.symbols.Length).Sum().ToString();
        return result;
    }

    /// <summary>I section: the H values of the next phrase, or two "xx" when there is none.</summary>
    public string[] i() {
        var result = Enumerable.Repeat("xx", 2).ToArray();
        if (next == null) {
            return result;
        } else {
            return next.h();
        }
    }

    /// <summary>
    /// J section: notes per measure and phonemes per measure (integer division,
    /// "xx" when the phrase spans no measure) and the total number of phrases.
    /// </summary>
    public string[] j() {
        var result = Enumerable.Repeat("xx", 3).ToArray();
        result[0] = (barCount > 0 ? (totalNotes / barCount).ToString() : "xx");
        result[1] = (barCount > 0 ? (totalPhonemes / barCount).ToString() : "xx");
        result[2] = totalPhrases.ToString();
        return result;
    }
}
static Dictionary frame_feature_size_dict = new Dictionary diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs b/OpenUtau.Core/Util/Python.cs similarity index 94% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs rename to OpenUtau.Core/Util/Python.cs index 9a27970d1..a8306a96d 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/Python.cs +++ b/OpenUtau.Core/Util/Python.cs @@ -1,7 +1,7 @@ using System; using System.Text.RegularExpressions; -namespace OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.python { +namespace OpenUtau.Core.Util.nnmnkwii.python { public class AssertionError : Exception { public AssertionError() : base() { } diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs b/OpenUtau.Core/Util/Scaler.cs similarity index 97% rename from OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs rename to OpenUtau.Core/Util/Scaler.cs index 6201fcfe5..39a5b303f 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/Scaler.cs +++ b/OpenUtau.Core/Util/Scaler.cs @@ -4,7 +4,7 @@ using System.Text; using Newtonsoft.Json; -namespace OpenUtau.Plugin.Builtin.EnunuOnnx { +namespace OpenUtau.Core.Util { public class ScalerLine { public float xmin; public float scale; diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs index 8c3b74a07..d3f3a1ccc 100644 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs +++ b/OpenUtau.Plugin.Builtin/EnunuOnnx/EnunuOnnxPhonemizer.cs @@ -9,9 +9,10 @@ using OpenUtau.Api; using OpenUtau.Core; using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; using OpenUtau.Plugin.Builtin.EnunuOnnx; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.frontend; -using OpenUtau.Plugin.Builtin.EnunuOnnx.nnmnkwii.io.hts; using Serilog; //This phonemizer is a pure C# implemention of the ENUNU phonemizer, @@ -280,9 +281,21 @@ string[] GetSymbols(Note note) { //make a HTS Note from given symbols and UNotes 
protected HTSNote makeHtsNote(string[] symbols, IList group, int startTick) { + UTimeSignature sig = timeAxis.TimeSignatureAtTick(group[0].position); + timeAxis.TickPosToBarBeat(group[0].position, out int bar, out int beat, out int remainingTicks); return new HTSNote( symbols: symbols, tone: group[0].tone, + isSlur: IsSyllableVowelExtensionNote(group[0]), + isRest: symbols.Select(x => x.ToLowerInvariant()).Any(x => pauses.Contains(x) || silences.Contains(x) || breaks.Contains(x)), + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: 0, + lang: string.Empty, + accent: string.Empty, + bpm: timeAxis.GetBpmAtTick(group[0].position), startms: (int)timeAxis.MsBetweenTickPos(startTick, group[0].position) + paddingMs, endms: (int)timeAxis.MsBetweenTickPos(startTick, group[^1].position + group[^1].duration) + paddingMs, positionTicks: group[0].position, @@ -439,26 +452,36 @@ protected virtual HTSNote[] MakeSyllables(Note[] inputNotes, int startTick) { HTSPhoneme[] HTSNoteToPhonemes(HTSNote htsNote) { var htsPhonemes = htsNote.symbols.Select(x => new HTSPhoneme(x, htsNote)).ToArray(); - int prevVowelPos = -1; - foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + foreach (var i in Enumerable.Range(0, htsPhonemes.Length)) { + htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); htsPhonemes[i].position = i + 1; htsPhonemes[i].position_backward = htsPhonemes.Length - i; - htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); - if (htsPhonemes[i].type == "v") { - prevVowelPos = i; - } else { - if (prevVowelPos > 0) { - htsPhonemes[i].distance_from_previous_vowel = i - prevVowelPos; + } + foreach (var i in Enumerable.Range(0, htsPhonemes.Length)) { + if (htsPhonemes[i].type.Equals("c")) { + var prev = i - 1; + if (prev >= 0) { + if (htsPhonemes[prev].type.Equals("v")) { + htsPhonemes[i].prev_vowel_distance = 1; + } else if (htsPhonemes[prev].prev_vowel_distance > 0) { + htsPhonemes[i].prev_vowel_distance 
= htsPhonemes[prev].prev_vowel_distance + 1; + } else { + htsPhonemes[i].prev_vowel_distance = 0; + } } } } - int nextVowelPos = -1; - for (int i = htsPhonemes.Length - 1; i > 0; --i) { - if (htsPhonemes[i].type == "v") { - nextVowelPos = i; - } else { - if (nextVowelPos > 0) { - htsPhonemes[i].distance_to_next_vowel = nextVowelPos - i; + for (var i = htsPhonemes.Length - 1; i >= 0; --i) { + if (htsPhonemes[i].type.Equals("c")) { + var next = i + 1; + if (next < htsPhonemes.Length) { + if (htsPhonemes[next].type.Equals("v")) { + htsPhonemes[i].next_vowel_distance = 1; + } else if (htsPhonemes[next].next_vowel_distance > 0) { + htsPhonemes[i].next_vowel_distance = htsPhonemes[next].next_vowel_distance + 1; + } else { + htsPhonemes[i].next_vowel_distance = 0; + } } } } @@ -473,9 +496,21 @@ void ProcessPart(Note[][] phrase) { int paddingTicks = timeAxis.MsPosToTickPos(paddingMs); var notePhIndex = new List { 1 };//每个音符的第一个音素在音素列表上对应的位置 var phAlignPoints = new List>();//音素对齐的位置,Ms,绝对时间 + UTimeSignature sig = timeAxis.TimeSignatureAtTick(phrase[0][0].position - paddingTicks); + timeAxis.TickPosToBarBeat(phrase[0][0].position - paddingTicks, out int bar, out int beat, out int remainingTicks); HTSNote PaddingNote = new HTSNote( - symbols: new string[] { "sil" }, + symbols: new string[] { defaultPause }, + beatPerBar: sig.beatPerBar, + beatUnit: sig.beatUnit, + positionBar: bar, + positionBeat: beat, + key: 0, + bpm: 0, tone: 0, + isSlur: false, + isRest: true, + lang: string.Empty, + accent: string.Empty, startms: 0, endms: paddingMs, positionTicks: phrase[0][0].position - paddingTicks, @@ -515,11 +550,12 @@ void ProcessPart(Note[][] phrase) { htsPhonemes.Count, timeAxis.TickPosToMsPos(lastNote.positionTicks + lastNote.durationTicks))); + var htsPhrase = new HTSPhrase(htsNotes.ToArray()); + htsPhrase.totalNotes = htsNotes.Count; + htsPhrase.totalPhonemes = htsPhonemes.Count; //make neighborhood links between htsNotes and between htsPhonemes foreach (int i in 
Enumerable.Range(0, htsNotes.Count)) { - htsNotes[i].index = i; - htsNotes[i].indexBackwards = htsNotes.Count - i; - htsNotes[i].sentenceDurMs = sentenceDurMs; + htsNotes[i].parent = htsPhrase; if (i > 0) { htsNotes[i].prev = htsNotes[i - 1]; htsNotes[i - 1].next = htsNotes[i]; diff --git a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs b/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs deleted file mode 100644 index ffe0750b6..000000000 --- a/OpenUtau.Plugin.Builtin/EnunuOnnx/HTS.cs +++ /dev/null @@ -1,256 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -//This file implement utaupy.hts python library's function -//https://github.com/oatsu-gh/utaupy/blob/master/utaupy/hts.py - -//HTS labels use b instead of # -//In HTS labels, "xx" is a preserved keyword that means null -namespace OpenUtau.Plugin.Builtin.EnunuOnnx { - public static class HTS { - public static readonly string[] KeysInOctave = { - "C", - "Db", - "D", - "Eb", - "E", - "F", - "Gb", - "G", - "Ab", - "A", - "Bb", - "B" , - }; - - public static readonly Dictionary NameInOctave = new Dictionary { - { "C", 0 }, { "C#", 1 }, { "Db", 1 }, - { "D", 2 }, { "D#", 3 }, { "Eb", 3 }, - { "E", 4 }, - { "F", 5 }, { "F#", 6 }, { "Gb", 6 }, - { "G", 7 }, { "G#", 8 }, { "Ab", 8 }, - { "A", 9 }, { "A#", 10 }, { "Bb", 10 }, - { "B", 11 }, - }; - - public static string GetToneName(int noteNum) { - return noteNum < 0 ? string.Empty : KeysInOctave[noteNum % 12] + (noteNum / 12 - 1).ToString(); - } - - //return -1 if error - public static int NameToTone(string name) { - if (name.Length < 2) { - return -1; - } - var str = name.Substring(0, (name[1] == '#' || name[1] == 'b') ? 2 : 1); - var num = name.Substring(str.Length); - if (!int.TryParse(num, out int octave)) { - return -1; - } - if (!NameInOctave.TryGetValue(str, out int inOctave)) { - return -1; - } - return 12 * (octave + 1) + inOctave; - } - - //write integer with "p" as positive and "n" as negative. 
0 is "p0" - public static string WriteInt(int integer) { - return (integer >= 0 ? "p":"m" )+Math.Abs(integer).ToString(); - } - } - - public class HTSPhoneme{ - public string symbol; - - //Links to this phoneme's neighbors and parent - public HTSPhoneme? prev; - public HTSPhoneme? next; - public HTSNote parent; - - //informations about this phoneme - //v:vowel, c:consonant, p:pause, s:silence, b:break - public string type = "xx"; - //(number of phonemes before this phoneme in this note) + 1 - public int position = 1; - //(number of phonemes after this phoneme in this note) + 1 - public int position_backward = 1; - //Here -1 means null - //distances to vowels in this note, -1 for vowels themselves - public int distance_from_previous_vowel = -1; - public int distance_to_next_vowel = -1; - - public HTSPhoneme(string phoneme, HTSNote note) { - this.symbol = phoneme; - this.parent = note; - } - - public HTSPhoneme? beforePrev { - get { - if (prev == null) { return null; } else { return prev.prev;} - } - } - - public HTSPhoneme? 
afterNext { - get { - if (next == null) { return null; } else { return next.next; } - } - } - - public string dump() { - //Write phoneme as an HTS line - - string result = - $"{parent.startMs * 100000} {parent.endMs * 100000} " - //Phoneme informations - + string.Format("{0}@{1}^{2}-{3}+{4}={5}_{6}%{7}^{8}_{9}~{10}-{11}!{12}[{13}${14}]{15}", p()) - //Syllable informations - + string.Format("/A:{0}-{1}-{2}@{3}~{4}", a()) - + string.Format("/B:{0}_{1}_{2}@{3}|{4}", b()) - + string.Format("/C:{0}+{1}+{2}@{3}&{4}", c()) - //Note informations - + string.Format("/D:{0}!{1}#{2}${3}%{4}|{5}&{6};{7}-{8}", d()) - + string.Format( - "/E:{0}]{1}^{2}={3}~{4}!{5}@{6}#{7}+{8}]{9}${10}|{11}[{12}&{13}]{14}={15}^{16}~{17}#{18}_{19};{20}${21}&{22}%{23}[{24}|{25}]{26}-{27}^{28}+{29}~{30}={31}@{32}${33}!{34}%{35}#{36}|{37}|{38}-{39}&{40}&{41}+{42}[{43};{44}]{45};{46}~{47}~{48}^{49}^{50}@{51}[{52}#{53}={54}!{55}~{56}+{57}!{58}^{59}", - e()) - +string.Format("/F:{0}#{1}#{2}-{3}${4}${5}+{6}%{7};{8}",f()) - + "/G:xx_xx/H:xx_xx/I:xx_xx/J:xx~xx@1" - ; - return result; - } - - public string[] p() { - var result = Enumerable.Repeat("xx",16).ToArray(); - result[0] = type; - result[1] = (beforePrev == null) ? "xx" : beforePrev.symbol; - result[2] = (prev == null) ? "xx" : prev.symbol; - result[3] = symbol; - result[4] = (next == null) ? "xx" : next.symbol; - result[5] = (afterNext == null) ? "xx" : afterNext.symbol; - result[11] = position.ToString(); - result[12] = position_backward.ToString(); - result[13] = distance_from_previous_vowel < 0 ? "xx" : distance_from_previous_vowel.ToString(); - result[14] = distance_to_next_vowel < 0 ? 
"xx" : distance_to_next_vowel.ToString(); - return result; - } - - public string[] a() { - return parent.a(); - } - - public string[] b() { - return parent.b(); - } - - public string[] c() { - return parent.c(); - } - - public string[] d() { - return parent.d(); - } - - public string[] e() { - return parent.e(); - } - - public string[] f() { - return parent.f(); - } - } - - //TODO - public class HTSNote { - public int startMs = 0; - public int endMs = 0; - public int positionTicks; - public int durationTicks = 0; - public int index = 0;//index of this note in sentence - public int indexBackwards = 0; - public int sentenceDurMs = 0; - - public int tone = 0; - public string[] symbols; - - public HTSNote? prev; - public HTSNote? next; - - public HTSNote(string[] symbols, int tone, int startms,int endms,int positionTicks, int durationTicks) { - this.startMs = startms; - this.endMs = endms; - this.tone = tone; - this.symbols = symbols; - this.positionTicks = positionTicks; - this.durationTicks = durationTicks; - } - - public int durationMs { - get { return endMs - startMs; } - } - - public int startMsBackwards { - get { return sentenceDurMs - startMs; } - } - - public string[] b() { - return new string[] { - symbols.Length.ToString(), - "1", - "1", - "xx", - "xx" - }; - } - - public string[] a() { - if (prev == null) { - return Enumerable.Repeat("xx", 5).ToArray(); - } else { - return prev.b(); - } - } - - public string[] c() { - if (next == null) { - return Enumerable.Repeat("xx", 5).ToArray(); - } else { - return next.b(); - } - } - - public string[] e() { - var result = Enumerable.Repeat("xx", 60).ToArray(); - result[0] = HTS.GetToneName(tone); - result[5] = "1";//number_of_syllables - result[6] = ((durationMs + 5) / 10).ToString();//duration in 10ms - result[7] = ((durationTicks + 10) / 20).ToString(); //length in 96th note, or 20 ticks - result[17] = index <= 0 ? "xx" : index.ToString();//index of note in sentence - result[18] = indexBackwards <= 0 ? 
"xx" : indexBackwards.ToString(); - result[19] = ((startMs + 50) / 100).ToString();//position in 100ms - result[20] = ((startMsBackwards + 50) / 100).ToString(); - if (this.tone > 0) { - result[56] = (prev == null || prev.tone <= 0) ? "p0" : HTS.WriteInt(prev.tone - tone); - result[57] = (next == null || next.tone <= 0) ? "p0" : HTS.WriteInt(next.tone - tone); - } else { - result[56] = "p0"; - result[57] = "p0"; - } - return result; - } - - public string[] d() { - if(prev == null) { - return Enumerable.Repeat("xx", 60).ToArray(); - } else { - return prev.e(); - } - } - public string[] f() { - if (next == null) { - return Enumerable.Repeat("xx", 60).ToArray(); - } else { - return next.e(); - } - } - } -} diff --git a/OpenUtau.Test/Core/Util/HtsSpecTests.cs b/OpenUtau.Test/Core/Util/HtsSpecTests.cs new file mode 100644 index 000000000..a54682df9 --- /dev/null +++ b/OpenUtau.Test/Core/Util/HtsSpecTests.cs @@ -0,0 +1,311 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Text; +using System.Text.RegularExpressions; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; +using Xunit; + +namespace OpenUtau.Core.Util { + public class HtsSpecTests { + private static readonly Regex CurrentPhonemePattern = new(@"^[^@]+@[^\^]+\^[^-]+-(?[^+]+)\+", RegexOptions.Compiled); + protected Dictionary phoneDict = new Dictionary(); + protected List vowels = new List() {"a","i","u","e","o" }; + protected List consonants = new List() {"k","s","t","n","h","m","y","r","w","g","z","d","b","p" }; + protected List breaks = new List(); + protected List pauses = new List() { "pau", "sil" }; + protected List silences = new List(); + protected List unvoiced = new List(); + + private string GetPhonemeType(string phoneme) { + if (phoneme == "xx") { + return "xx"; + } + if (vowels.Contains(phoneme)) { + return "v"; + } + if (pauses.Contains(phoneme)) { + return 
"p"; + } + if (silences.Contains(phoneme)) { + return "s"; + } + if (breaks.Contains(phoneme)) { + return "b"; + } + //if (unvoiced.Contains(phoneme)) { + // return "c"; + //} + return "c"; + } + + private HTSNote MakeNote(int startMs, int endMs, int positionTicks, int durationTicks, int positionBar, string accent = "") { + var symbols = new[] { "a" }; + var beatPerBar = 4; + var beatUnit = 4; + var key = 0; + double bpm = 120; + var tone = 60; // C4 + var isSlur = false; + var isRest = false; + var lang = "JPN"; + var accentStr = accent; + var note = new HTSNote(symbols, beatPerBar, beatUnit, positionBar, 0, key, bpm, tone, isSlur, isRest, lang, accentStr, startMs, endMs, positionTicks, durationTicks); + return note; + } + + private HTSPhrase BuildPhrase(HTSNote[] notes, int resolution) { + var phrase = new HTSPhrase(notes); + phrase.UpdateResolution(resolution); + var sentenceDurMs = notes.Sum(n => n.durationMs); + var sentenceDurTicks = notes.Sum(n => n.durationTicks); + for (var i = 0; i < notes.Length; i++) { + var n = notes[i]; + n.parent = phrase; + n.index = i + 1; + n.indexBackwards = notes.Length - i; + n.sentenceDurMs = sentenceDurMs; + n.sentenceDurTicks = sentenceDurTicks; + if (i > 0) { + notes[i - 1].next = n; + n.prev = notes[i - 1]; + } + } + return phrase; + } + + private TimeAxis BuildDefaultTimeAxis() { + var timeAxis = new TimeAxis(); + var project = new UProject(); + timeAxis.BuildSegments(project); + return timeAxis; + } + + [Fact] + public void MeasureForwardBackwardAreComputedPerBar() { + var res = 480; // ticks per quarter + var ticksPer96 = res / 24; // 20 + var n0 = MakeNote(0, 1000, 0, 480, 0); + var n1 = MakeNote(1000, 2000, 480, 480, 0); + var n2 = MakeNote(2000, 3000, 960, 480, 0); + var phrase = BuildPhrase(new[] { n0, n1, n2 }, res); + + var e0 = n0.e(); + var e1 = n1.e(); + var e2 = n2.e(); + + // forward index (e10) + Assert.Equal("1", e0[9]); + Assert.Equal("2", e1[9]); + Assert.Equal("3", e2[9]); + // backward index (e11) + 
Assert.Equal("3", e0[10]); + Assert.Equal("2", e1[10]); + Assert.Equal("1", e2[10]); + + // forward ms in centiseconds (e12) + Assert.Equal("0", e0[11]); + Assert.Equal("10", e1[11]); + Assert.Equal("20", e2[11]); + // backward ms in centiseconds (e13) + Assert.Equal("20", e0[12]); + Assert.Equal("10", e1[12]); + Assert.Equal("0", e2[12]); + + // forward 96th (e14) + Assert.Equal("0", e0[13]); + Assert.Equal((480 / ticksPer96).ToString(), e1[13]); + Assert.Equal((960 / ticksPer96).ToString(), e2[13]); + // backward 96th (e15) + Assert.Equal((960 / ticksPer96).ToString(), e0[14]); + Assert.Equal((480 / ticksPer96).ToString(), e1[14]); + Assert.Equal("0", e2[14]); + + // forward percent (e16) + Assert.Equal("0", e0[15]); + Assert.Equal("50", e1[15]); + Assert.Equal("100", e2[15]); + // backward percent (e17) + Assert.Equal("100", e0[16]); + Assert.Equal("50", e1[16]); + Assert.Equal("0", e2[16]); + } + + [Fact] + public void AccentDistancesForwardBackward() { + var res = 480; + var ticksPer96 = res / 24; // 20 + var n0 = MakeNote(0, 1000, 0, 480, 0, accent: ""); + var n1 = MakeNote(1000, 2000, 480, 480, 0, accent: "A"); + var n2 = MakeNote(2000, 3000, 960, 480, 0, accent: ""); + var n3 = MakeNote(3000, 4000, 1440, 480, 0, accent: "A"); + var phrase = BuildPhrase(new[] { n0, n1, n2, n3 }, res); + + var e0 = n0.e(); + var e1 = n1.e(); + var e2 = n2.e(); + var e3 = n3.e(); + + // For n2 (between accents): distances should be 1 note, 100 cs, 24 (96th) + Assert.Equal("1", e2[28]); // next accent (notes) + Assert.Equal("1", e2[29]); // prev accent (notes) + Assert.Equal("100", e2[30]); // next accent (cs) + Assert.Equal("100", e2[31]); // prev accent (cs) + Assert.Equal((480 / ticksPer96).ToString(), e2[32]); // next (96th) + Assert.Equal((480 / ticksPer96).ToString(), e2[33]); // prev (96th) + + // For n1 (accent): prev distance is 0, next accent is one note away (n2) + Assert.Equal("1", e1[28]); // next accent (n3 via one note n2) + Assert.Equal("0", e1[29]); // prev 
accent (itself) + Assert.Equal("100", e1[30]); // next accent (cs) + Assert.Equal("0", e1[31]); // prev accent (cs) + } + + [Fact] + public void NoteToPhonemesKeepsSharedNoteTiming() { + var note = new HTSNote( + new[] { "k", "a", "pau" }, + 4, + 4, + 0, + 0, + 0, + 120, + 60, + false, + false, + "JPN", + string.Empty, + 120, + 360, + 0, + 480); + + var htsPhonemes = note.symbols.Select(x => new HTSPhoneme(x, note)).ToArray(); + int prevVowelPos = -1; + foreach (int i in Enumerable.Range(0, htsPhonemes.Length)) { + htsPhonemes[i].position = i + 1; + htsPhonemes[i].position_backward = htsPhonemes.Length - i; + htsPhonemes[i].type = GetPhonemeType(htsPhonemes[i].symbol); + if (htsPhonemes[i].type == "v") { + prevVowelPos = i; + } else { + if (prevVowelPos > 0) { + htsPhonemes[i].prev_vowel_distance = i - prevVowelPos; + } + } + } + int nextVowelPos = -1; + for (int i = htsPhonemes.Length - 1; i > 0; --i) { + if (htsPhonemes[i].type == "v") { + nextVowelPos = i; + } else { + if (nextVowelPos > 0) { + htsPhonemes[i].next_vowel_distance = nextVowelPos - i; + } + } + } + + Assert.Equal(3, htsPhonemes.Length); + Assert.All(htsPhonemes, phoneme => Assert.Same(note, phoneme.parent)); + Assert.All(htsPhonemes, phoneme => Assert.Equal(120, phoneme.parent.startMs)); + Assert.All(htsPhonemes, phoneme => Assert.Equal(360, phoneme.parent.endMs)); + Assert.Equal(new[] { 1, 2, 3 }, htsPhonemes.Select(phoneme => phoneme.position).ToArray()); + Assert.Equal(new[] { 3, 2, 1 }, htsPhonemes.Select(phoneme => phoneme.position_backward).ToArray()); + Assert.Equal(new[] { "c", "v", "p" }, htsPhonemes.Select(phoneme => phoneme.type).ToArray()); + Assert.Equal(1, htsPhonemes[2].prev_vowel_distance); + } + + [Fact] + public void PhraseResolutionUpdateRecomputesMeasureTicks() { + var note0 = MakeNote(0, 1000, 0, 960, 0); + var note1 = MakeNote(1000, 2000, 960, 960, 0); + var phrase = new HTSPhrase(new[] { note0, note1 }); + note0.parent = phrase; + note1.parent = phrase; + note0.index = 1; + 
note1.index = 2; + note0.indexBackwards = 2; + note1.indexBackwards = 1; + note0.next = note1; + note1.prev = note0; + note0.sentenceDurMs = 2000; + note1.sentenceDurMs = 2000; + note0.sentenceDurTicks = 1920; + note1.sentenceDurTicks = 1920; + + phrase.UpdateResolution(960); + + var e1 = note1.e(); + Assert.Equal("24", e1[13]); + Assert.Equal("24", e1[21]); + } + + [Fact] + public void RestNoteMasksPitchFields() { + var rest = MakeNote(0, 500, 0, 480, 0); + rest.isRest = true; + rest.tone = 0; + + var phrase = BuildPhrase(new[] { rest }, 480); + var e = rest.e(); + + Assert.Equal("xx", e[0]); + Assert.Equal("xx", e[1]); + Assert.Equal("xx", e[56]); + Assert.Equal("xx", e[57]); + } + + [Fact] + public void PitchDifferenceToRestNeighborsUsesXx() { + var restStart = MakeNote(0, 500, 0, 480, 0); + restStart.isRest = true; + restStart.tone = 0; + var note = MakeNote(500, 1000, 480, 480, 0); + var restEnd = MakeNote(1000, 1500, 960, 480, 0); + restEnd.isRest = true; + restEnd.tone = 0; + + var phrase = BuildPhrase(new[] { restStart, note, restEnd }, 480); + var e = note.e(); + + Assert.Equal("xx", e[56]); + Assert.Equal("xx", e[57]); + } + + [Fact] + public void AlignTimingPositionsFollowsAnchorPoints() { + var durations = new[] { 20d, 10d, 30d }; + var alignPoints = new[] { + Tuple.Create(1, 100d), + Tuple.Create(3, 160d), + }; + + var positions = HTSContextBuilder.AlignTimingPositions(durations, alignPoints); + + Assert.Equal(2, positions.Count); + Assert.Equal(100d, positions[0]); + Assert.Equal(115d, positions[1]); + } + + [Fact] + public void BuildAlignedNoteTimingResultReturnsNoteRelativeTicks() { + var result = HTSContextBuilder.BuildAlignedNoteTimingResult( + new[] { "pau", "a", "b", "c" }, + 1, + 4, + new[] { 80d, 100d, 120d }, + 50d, + (start, end) => (int)Math.Round(end - start)); + + Assert.Equal(3, result.Count); + Assert.Equal(Tuple.Create("a", 30), result[0]); + Assert.Equal(Tuple.Create("b", 50), result[1]); + Assert.Equal(Tuple.Create("c", 70), 
result[2]); + } + } +} diff --git a/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs new file mode 100644 index 000000000..97b9b1893 --- /dev/null +++ b/OpenUtau.Test/Plugins/HtsLabelPhonemizerTest.cs @@ -0,0 +1,252 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Reflection; +using System.Text; +using OpenUtau.Api; +using OpenUtau.Classic; +using OpenUtau.Core; +using OpenUtau.Core.Format; +using OpenUtau.Core.Hts; +using OpenUtau.Core.Ustx; +using OpenUtau.Core.Util; +using OpenUtau.Core.Util.nnmnkwii.frontend; +using OpenUtau.Core.Util.nnmnkwii.io.hts; +using Xunit; +using Xunit.Abstractions; + +namespace OpenUtau.Plugins { + // Minimal concrete HTSLabelPhonemizer for testing without external aligners. + class DummyHtsLabelPhonemizer : HTSLabelPhonemizer { + public string GeneratedFullScorePath => fullScorePath; + public string GeneratedMonoTimingPath => monoTimingPath; + public string GeneratedTempPath => htstmpPath; + public bool DictionaryLoadedBeforeG2p { get; private set; } + + public DummyHtsLabelPhonemizer() { + // Minimal language and symbol classes + lang = "JPN"; + vowels = new List { "a", "i", "u", "e", "o" }; + pauses = new List { "pau" }; + silences = new List { "sil" }; + breaks = new List { "br" }; + tablePath = "oto.ini"; + } + + protected override IG2p LoadG2p(string rootPath) { + DictionaryLoadedBeforeG2p = phoneDict.Count > 0; + // Provide a tiny JP-like dictionary: simple CV mapping. 
+ var builder = G2pDictionary.NewBuilder(); + // vowels + builder.AddSymbol("a", true); + builder.AddSymbol("i", true); + builder.AddSymbol("u", true); + builder.AddSymbol("e", true); + builder.AddSymbol("o", true); + // consonants + var cons = new[] { "k", "s", "t", "n", "h", "m", "y", "r", "w" }; + foreach (var c in cons) builder.AddSymbol(c, false); + // pauses etc + builder.AddSymbol("pau", false); + builder.AddSymbol("sil", false); + builder.AddSymbol("br", false); + // single vowels + builder.AddEntry("a", new[] { "a" }); + builder.AddEntry("i", new[] { "i" }); + builder.AddEntry("u", new[] { "u" }); + builder.AddEntry("e", new[] { "e" }); + builder.AddEntry("o", new[] { "o" }); + // CV (subset) + builder.AddEntry("ka", new[] { "k", "a" }); + builder.AddEntry("ki", new[] { "k", "i" }); + builder.AddEntry("ku", new[] { "k", "u" }); + builder.AddEntry("ke", new[] { "k", "e" }); + builder.AddEntry("ko", new[] { "k", "o" }); + builder.AddEntry("ta", new[] { "t", "a" }); + builder.AddEntry("ti", new[] { "t", "i" }); + builder.AddEntry("to", new[] { "t", "o" }); + builder.AddEntry("na", new[] { "n", "a" }); + builder.AddEntry("ni", new[] { "n", "i" }); + builder.AddEntry("no", new[] { "n", "o" }); + builder.AddEntry("ma", new[] { "m", "a" }); + builder.AddEntry("mi", new[] { "m", "i" }); + builder.AddEntry("mo", new[] { "m", "o" }); + builder.AddEntry("ra", new[] { "r", "a" }); + builder.AddEntry("ri", new[] { "r", "i" }); + builder.AddEntry("ro", new[] { "r", "o" }); + return builder.Build(); + } + + protected override HTSNote CustomHTSNoteContext(HTSNote htsNote, Phonemizer.Note note) { + return htsNote; // no-op + } + + protected override HTSPhoneme[] CustomHTSPhonemeContext(HTSPhoneme[] htsPhonemes, Phonemizer.Note[] notes) { + return htsPhonemes; // no-op + } + + protected override Phonemizer.Note[][] PhraseAdjustments(Phonemizer.Note[][] phrese) { + return phrese; // no-op + } + + protected override void SendScore(Phonemizer.Note[][] phrase) { + // Create a 
fake mono_timing.lab with uniform 100ms durations for each phoneme in full_score.lab + if (!Directory.Exists(htstmpPath)) { + Directory.CreateDirectory(htstmpPath); + } + int count = 0; + if (File.Exists(fullScorePath)) { + count = File.ReadLines(fullScorePath).Count(); + } + long start = 0; + var lines = new List(count); + for (int i = 0; i < count; i++) { + long end = start + 1_000_000; // 100ms in 100ns units + lines.Add($"{start} {end} a"); + start = end; + } + File.WriteAllLines(monoTimingPath, lines); + } + } + + public class HtsLabelPhonemizerTest : PhonemizerTestBase { + public HtsLabelPhonemizerTest(ITestOutputHelper output) : base(output) { } + + protected override Phonemizer CreatePhonemizer() { + return new DummyHtsLabelPhonemizer(); + } + + [Theory] + [InlineData(new string[] { "a" }, new string[] { "a" })] + [InlineData(new string[] { "a", "i" }, new string[] { "a", "i" })] + [InlineData(new string[] { "a", "+~a", "i" }, new string[] { "a", "i" })] // extension note should not duplicate symbols + // JP CV + [InlineData(new string[] { "ka" }, new string[] { "k", "a" })] + [InlineData(new string[] { "ka", "ki" }, new string[] { "k", "a", "k", "i" })] + [InlineData(new string[] { "ka", "+~a", "ki" }, new string[] { "k", "a", "k", "i" })] + public void BasicHtsPipelineTest(string[] lyrics, string[] aliases) { + SameAltsTonesColorsTest("en_delta0", lyrics, aliases, "", "C4", ""); + } + + [Fact] + public void DictionaryIsLoadedBeforeG2p() { + var phonemizer = CreateConfiguredPhonemizer(new[] { "a" }); + + Assert.True(phonemizer.DictionaryLoadedBeforeG2p); + } + + [Fact] + public void GeneratedLabelsCanDriveFrontendAndSimpleSynthesis() { + var phonemizer = CreateConfiguredPhonemizer(new[] { "ka", "ki", "ro" }); + + Assert.True(File.Exists(phonemizer.GeneratedFullScorePath)); + Assert.True(File.Exists(phonemizer.GeneratedMonoTimingPath)); + + var questionPath = WriteMinimalQuestionSet(phonemizer.GeneratedTempPath); + var questionSet = 
hts.load_question_set(questionPath, encoding: Encoding.UTF8); + var fullLabels = hts.load(phonemizer.GeneratedFullScorePath, Encoding.UTF8); + var monoLabels = hts.load(phonemizer.GeneratedMonoTimingPath, Encoding.UTF8); + var features = merlin.linguistic_features(fullLabels, questionSet.Item1, questionSet.Item2); + + Assert.Equal(fullLabels.Count, monoLabels.Count); + Assert.Equal(fullLabels.Count, features.Count); + Assert.All(features, feature => { + Assert.Single(feature); + Assert.Equal(1f, feature[0]); + }); + + var waveform = SynthesizeFromLabels(monoLabels, features, 16000); + + Assert.NotEmpty(waveform); + Assert.Contains(waveform, sample => Math.Abs(sample) > 0.0001f); + } + + DummyHtsLabelPhonemizer CreateConfiguredPhonemizer(string[] lyrics) { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + var dir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + var basePath = Path.Join(dir, "Files"); + var file = Path.Join(basePath, "en_delta0", "character.txt"); + + VoicebankLoader.IsTest = true; + var voicebank = new Voicebank() { File = file, BasePath = dir }; + VoicebankLoader.LoadVoicebank(voicebank); + var singer = new ClassicSinger(voicebank); + singer.EnsureLoaded(); + + var project = new UProject(); + Ustx.AddDefaultExpressions(project); + var track = project.tracks[0]; + project.expressions.TryGetValue(Ustx.CLR, out var descriptor); + track.VoiceColorExp = descriptor.Clone(); + var colors = singer.Subbanks.Select(subbank => subbank.Color).ToHashSet(); + track.VoiceColorExp.options = colors.OrderBy(color => color).ToArray(); + track.VoiceColorExp.max = track.VoiceColorExp.options.Length - 1; + + var timeAxis = new TimeAxis(); + timeAxis.BuildSegments(project); + + var phonemizer = new DummyHtsLabelPhonemizer(); + phonemizer.Testing = true; + phonemizer.SetSinger(singer); + phonemizer.SetTiming(timeAxis); + phonemizer.SetUp(BuildGroups(lyrics), project, track); + return phonemizer; + } + + Phonemizer.Note[][] 
BuildGroups(string[] lyrics) { + var groups = new List(); + int position = 240; + foreach (var lyric in lyrics) { + groups.Add(new[] { + new Phonemizer.Note { + lyric = lyric, + duration = 240, + position = position, + tone = Core.MusicMath.NameToTone("C4"), + phonemeAttributes = new[] { + new Phonemizer.PhonemeAttributes { + index = 0, + consonantStretchRatio = 1, + voiceColor = string.Empty, + } + }, + } + }); + position += 240; + } + return groups.ToArray(); + } + + string WriteMinimalQuestionSet(string directory) { + var questionPath = Path.Combine(directory, "test-minimal.qst"); + File.WriteAllLines(questionPath, new[] { + "QS \"ALL\" {*}", + }); + return questionPath; + } + + float[] SynthesizeFromLabels(HTSLabelFile monoLabels, List> features, int sampleRate) { + Assert.True(monoLabels.Count > 0); + long totalDuration = monoLabels[^1].end_time; + int totalSamples = (int)Math.Ceiling(totalDuration / 10_000_000.0 * sampleRate); + var waveform = new float[totalSamples]; + for (int index = 0; index < monoLabels.Count; index++) { + var label = monoLabels[index]; + Assert.True(label.end_time > label.start_time); + if (index > 0) { + Assert.Equal(monoLabels[index - 1].end_time, label.start_time); + } + int startSample = (int)Math.Round(label.start_time / 10_000_000.0 * sampleRate); + int endSample = Math.Min(totalSamples, (int)Math.Round(label.end_time / 10_000_000.0 * sampleRate)); + float amplitude = 0.05f + 0.05f * features[index].Sum(); + float frequency = 220f + 30f * index; + for (int sample = startSample; sample < endSample; sample++) { + float time = sample / (float)sampleRate; + waveform[sample] = amplitude * (float)Math.Sin(2 * Math.PI * frequency * time); + } + } + return waveform; + } + } +}