From ff7ed660d0e7fac29f4b4bf1be56495d96108dec Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 13:52:29 -0400 Subject: [PATCH 01/12] Fix CMake macOS SDK and BLAS/LAPACK cache after Xcode upgrades When the cached sysroot path is missing, set CMAKE_OSX_SYSROOT from xcrun. Before add_subdirectory(src), clear cached FindBLAS, FindLAPACK, and MATH paths that reference a removed SDK or a different SDK tree than the active sysroot. Remove duplicate find_library(MATH_LIBRARY) in src/CMakeLists.txt. --- CMakeLists.txt | 88 ++++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 1 - 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86789c6a..bbf8e4c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,28 @@ cmake_minimum_required(VERSION 3.14) # I like pie +# After an Xcode upgrade, CMakeCache may keep CMAKE_OSX_SYSROOT pointing at a removed +# SDK; the link step then fails (e.g. missing .../usr/lib/libm.tbd). Reset to the +# active SDK when the cached path is missing (must run before project()). 
+if(APPLE) + set(_sphinxtrain_sysroot_ok FALSE) + if(CMAKE_OSX_SYSROOT AND EXISTS "${CMAKE_OSX_SYSROOT}") + set(_sphinxtrain_sysroot_ok TRUE) + endif() + if(NOT _sphinxtrain_sysroot_ok) + execute_process( + COMMAND xcrun --sdk macosx --show-sdk-path + OUTPUT_VARIABLE _sphinxtrain_osx_sdk + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE _sphinxtrain_xcrun_rv + ) + if(_sphinxtrain_xcrun_rv EQUAL 0 AND _sphinxtrain_osx_sdk AND EXISTS "${_sphinxtrain_osx_sdk}") + set(CMAKE_OSX_SYSROOT "${_sphinxtrain_osx_sdk}" CACHE PATH "macOS SDK path" FORCE) + message(STATUS "CMAKE_OSX_SYSROOT -> ${CMAKE_OSX_SYSROOT}") + endif() + endif() +endif() + project(SphinxTrain VERSION 5.0.0 DESCRIPTION "CMU Sphinx Trainer" HOMEPAGE_URL "https://github.com/cmusphinx/sphinxtrain") @@ -66,6 +89,71 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) configure_file(config.h.in config.h) add_definitions(-DHAVE_CONFIG_H) +# FindBLAS / FindLAPACK cache absolute paths under the SDK; after an Xcode upgrade those +# files may move (e.g. .../MacOSX26.1.sdk/...) while CMAKE_OSX_SYSROOT was fixed above. +# Also drop paths that still exist but live under a different SDK tree than CMAKE_OSX_SYSROOT. 
+if(APPLE) + set(_sphinxtrain_apple_libcache_vars + BLAS_Accelerate_LIBRARY + BLAS_blas_LIBRARY + BLAS_LIBRARY + LAPACK_Accelerate_LIBRARY + LAPACK_lapack_LIBRARY + LAPACK_LIBRARY + MATH_LIBRARY + ) + foreach(_sphinxtrain_v IN LISTS _sphinxtrain_apple_libcache_vars) + if(${_sphinxtrain_v}) + set(_sphinxtrain_p "${${_sphinxtrain_v}}") + if(_sphinxtrain_p MATCHES "^/") + set(_sphinxtrain_drop FALSE) + if(NOT EXISTS "${_sphinxtrain_p}") + set(_sphinxtrain_drop TRUE) + elseif(CMAKE_OSX_SYSROOT AND _sphinxtrain_p MATCHES "/[^/]+\\.sdk/") + string(LENGTH "${CMAKE_OSX_SYSROOT}" _sphinxtrain_sl) + if(_sphinxtrain_sl GREATER 0) + string(SUBSTRING "${_sphinxtrain_p}" 0 ${_sphinxtrain_sl} _sphinxtrain_head) + if(NOT _sphinxtrain_head STREQUAL CMAKE_OSX_SYSROOT) + set(_sphinxtrain_drop TRUE) + endif() + endif() + endif() + if(_sphinxtrain_drop) + unset(${_sphinxtrain_v} CACHE) + endif() + endif() + endif() + endforeach() + foreach(_sphinxtrain_listvar BLAS_LIBRARIES LAPACK_LIBRARIES) + if(${_sphinxtrain_listvar}) + set(_sphinxtrain_drop_libs FALSE) + foreach(_sphinxtrain_item IN LISTS ${_sphinxtrain_listvar}) + if(_sphinxtrain_item MATCHES "^/") + if(NOT EXISTS "${_sphinxtrain_item}") + set(_sphinxtrain_drop_libs TRUE) + elseif(CMAKE_OSX_SYSROOT AND _sphinxtrain_item MATCHES "/[^/]+\\.sdk/") + string(LENGTH "${CMAKE_OSX_SYSROOT}" _sphinxtrain_sl) + if(_sphinxtrain_sl GREATER 0) + string(SUBSTRING "${_sphinxtrain_item}" 0 ${_sphinxtrain_sl} _sphinxtrain_head) + if(NOT _sphinxtrain_head STREQUAL CMAKE_OSX_SYSROOT) + set(_sphinxtrain_drop_libs TRUE) + endif() + endif() + endif() + endif() + endforeach() + if(_sphinxtrain_drop_libs) + unset(${_sphinxtrain_listvar} CACHE) + if(_sphinxtrain_listvar STREQUAL "BLAS_LIBRARIES") + unset(BLAS_FOUND CACHE) + else() + unset(LAPACK_FOUND CACHE) + endif() + endif() + endif() + endforeach() +endif() + # Compile all the things add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6a80637b..f86b3a36 100644 --- 
a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,6 @@ libs/libsphinxbase/util/f2c_lite.c ) find_package(BLAS) find_package(LAPACK) -find_library(MATH_LIBRARY m) if(NOT LAPACK_FOUND) message("System-wide LAPACK not found, will use internal version") add_library(sphinxtrain ${SRCS} ${LAPACK_SRCS}) From f22fb55978fdf154dc95e770f3342bb0def44bf8 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 13:52:29 -0400 Subject: [PATCH 02/12] sphinx3_align: handle align_frame failure without aborting Return -1 from align_frame on bad alignment instead of asserting. On failure, emit a reference outsent line when possible and continue. Log how many outsent fallbacks were written at the end of a run. --- src/programs/sphinx3_align/main_align.c | 57 +++++++++++++++++++++++-- src/programs/sphinx3_align/s3_align.c | 7 ++- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/programs/sphinx3_align/main_align.c b/src/programs/sphinx3_align/main_align.c index 100e5ab0..52ae4f5b 100644 --- a/src/programs/sphinx3_align/main_align.c +++ b/src/programs/sphinx3_align/main_align.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -174,6 +175,8 @@ static FILE *outctlfp = NULL; static const char *sentfile; static FILE *sentfp = NULL; +static int32 n_align_outsent_fallback; + static char *s2stsegdir = NULL; static char *stsegdir = NULL; static char *phsegdir = NULL; @@ -659,6 +662,37 @@ write_outctl(FILE * fp, char *uttctl) fflush(fp); } +static void +trim_spaces(char *s) +{ + char *p; + char *end; + + if (!s || !*s) + return; + for (p = s; *p && isspace((unsigned char) *p); p++); + if (p != s) + memmove(s, p, strlen(p) + 1); + end = s + strlen(s); + while (end > s && isspace((unsigned char) end[-1])) + *--end = '\0'; +} + +/** + * Emit one -outsent line from the reference (-insent) text when Viterbi alignment fails, + * so downstream jobs stay line-aligned with -ctl (one line per utterance). 
+ */ +static void +write_outsent_insent_fallback(FILE * fp, char *sent, const char *uttid) +{ + if (!fp) + return; + trim_spaces(sent); + fprintf(fp, "%s (%s)\n", sent, uttid); + fflush(fp); + ++n_align_outsent_fallback; + E_INFO("Wrote -outsent fallback (reference text) for %s\n", uttid); +} /* @@ -681,6 +715,7 @@ align_utt(char *sent, /* In: Reference transcript */ if (nfr <= (w << 1)) { E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid, (w << 1) + 1, nfr); + write_outsent_insent_fallback(outsentfp, sent, uttid); return; } @@ -698,7 +733,7 @@ align_utt(char *sent, /* In: Reference transcript */ ptmr_stop(timers + tmr_utt); E_ERROR("No sentence HMM; no alignment for %s\n", uttid); - + write_outsent_insent_fallback(outsentfp, sent, uttid); return; } @@ -751,7 +786,16 @@ align_utt(char *sent, /* In: Reference transcript */ /* Step alignment one frame forward */ ptmr_start(timers + tmr_align); - align_frame(ascr->senscr); + if (align_frame(ascr->senscr) < 0) { + ptmr_stop(timers + tmr_align); + ptmr_stop(timers + tmr_utt); + ptmr_stop(&tm_utt); + ptmr_stop(&tm_ovrhd); + E_ERROR("Alignment failed mid-utterance for %s\n", uttid); + write_outsent_insent_fallback(outsentfp, sent, uttid); + align_destroy_sent_hmm(); + return; + } ptmr_stop(timers + tmr_align); ptmr_stop(timers + tmr_utt); } @@ -761,8 +805,10 @@ align_utt(char *sent, /* In: Reference transcript */ printf("\n"); /* Wind up alignment for this utterance */ - if (align_end_utt(&stseg, &phseg, &wdseg) < 0) + if (align_end_utt(&stseg, &phseg, &wdseg) < 0) { E_ERROR("Final state not reached; no alignment for %s\n\n", uttid); + write_outsent_insent_fallback(outsentfp, sent, uttid); + } else { if (s2stsegdir) write_s2stseg(s2stsegdir, stseg, uttid, ctlspec, cmd_ln_boolean_r(kbc->config, "-s2cdsen")); @@ -914,6 +960,7 @@ utt_align(void *data, utt_res_t * ur, int32 sf, int32 ef, char *uttid) ("Utt %s: Input file read (%s) with extension (%s) failed \n", uttid, ur->uttfile, cepext); } + 
write_outsent_insent_fallback(outsentfp, sent, uttid); } else { E_INFO("%s: %d input frames\n", uttid, nfr); @@ -1039,6 +1086,10 @@ main(int32 argc, char *argv[]) tm_utt.t_tot_elapsed / (tot_nfr * 0.01)); } + if (n_align_outsent_fallback > 0) + E_INFO("Utterances with -outsent reference fallback (alignment skipped): %d\n", + n_align_outsent_fallback); + if (outsentfp) fclose(outsentfp); if (outctlfp) diff --git a/src/programs/sphinx3_align/s3_align.c b/src/programs/sphinx3_align/s3_align.c index 9bb7d594..e33bc108 100644 --- a/src/programs/sphinx3_align/s3_align.c +++ b/src/programs/sphinx3_align/s3_align.c @@ -1078,7 +1078,12 @@ align_frame(int32 * senscr) } } } - assert(tmpbest > (int32) 0x80000000); + if (tmpbest <= (int32) 0x80000000) { + E_ERROR + ("No active predecessor for aligner state (frame %d); try a wider -beam (e.g. 1e-308)\n", + curfrm); + return -1; + } s->newscore = tmpbest + senscr[s->sen]; s->newhist = tmphist; From 2272750f4f82da595da5f63b6184c388b3aec2e3 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 13:52:32 -0400 Subject: [PATCH 03/12] Add multipron force alignment and CFG_MULTIPRON training hook Add CFG_MULTIPRON to sphinx_train.cfg: when yes, Baum-Welch and related steps use BASE_DIR/multipron_align/EXPT.multipron.transcription. Extend GetLists and the untied-HMM, lattice, and BW slaves accordingly. Add multipron_align.pl and multipron_align.py to merge dicts, run sphinx3_align, and write that transcript (beam from CFG_FORCE_ALIGN_BEAM or default 1e-308). 
--- etc/sphinx_train.cfg | 8 + scripts/11.force_align/multipron_align.pl | 34 ++ scripts/11.force_align/multipron_align.py | 349 ++++++++++++++++++ scripts/30.cd_hmm_untied/slave_convg.pl | 4 + scripts/60.lattice_generation/slave_genlat.pl | 4 + scripts/65.mmie_train/baum_welch.pl | 4 + scripts/lib/SphinxTrain/Util.pm | 4 + templates/librispeech/etc/sphinx_train.cfg | 8 + 8 files changed, 415 insertions(+) create mode 100644 scripts/11.force_align/multipron_align.pl create mode 100644 scripts/11.force_align/multipron_align.py diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg index fc566527..52f6b01c 100644 --- a/etc/sphinx_train.cfg +++ b/etc/sphinx_train.cfg @@ -196,6 +196,14 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE # rejected for bad alignment. $CFG_FORCE_ALIGN_BEAM = 1e-60; +# Multipron alignment (sphinx3_align): set to yes after you run +# perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl +# so Baum-Welch and related steps use +# $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription +# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width +# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts). +$CFG_MULTIPRON = 'no'; + # Calculate an LDA/MLLT transform? $CFG_LDA_MLLT = 'no'; # Dimensionality of LDA/MLLT output diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl new file mode 100644 index 00000000..62e4fe5b --- /dev/null +++ b/scripts/11.force_align/multipron_align.pl @@ -0,0 +1,34 @@ +#!/usr/bin/env perl +## Run multipron_align.py using paths from sphinx_train.cfg (project base = $CFG_BASE_DIR). +## +## Usage (from training project directory, same as sphinxtrain run): +## perl path/to/scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH] +## +## See $CFG_MULTIPRON in etc/sphinx_train.cfg (set to yes after alignment for training). 
+ +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Spec::Functions qw(catfile updir); + +use lib catfile(dirname($0), updir(), 'lib'); +use SphinxTrain::Config; + +my @args = @ARGV; +my $dry = (@args && $args[0] eq '--dry-run') ? shift @args : 0; + +my $etc = catfile($ST::CFG_BASE_DIR, 'etc'); +if (!-d $etc) { + die "Missing directory $etc (run from project after setup, or fix \$CFG_BASE_DIR)\n"; +} + +my $py = catfile($ST::CFG_SPHINXTRAIN_DIR, 'scripts', '11.force_align', 'multipron_align.py'); +if (!-f $py) { + die "Missing $py\n"; +} + +my @py = ('python', $py); +push @py, '--dry-run' if $dry; +push @py, $etc, @args; +exec @py; +die "exec failed: $!\n"; diff --git a/scripts/11.force_align/multipron_align.py b/scripts/11.force_align/multipron_align.py new file mode 100644 index 00000000..9aa4efdb --- /dev/null +++ b/scripts/11.force_align/multipron_align.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python +""" +Force-align the training set with sphinx3_align and write SphinxTrain-style transcripts +that reflect the Viterbi-best pronunciation variants (dict word strings from -outsent). + +This mirrors scripts/11.force_align/slave_align.pl phases 3–4 (dictionary merge + aligninput) +and scripts/11.force_align/force_align.pl (sphinx3_align invocation), without running the +full SphinxTrain ``sphinxtrain run`` pipeline. + +Training layout comes from ``etc/sphinx_train.cfg`` (``CFG_BASE_DIR``, ``CFG_EXPTNAME``, +and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``. Set +``CFG_MULTIPRON`` to ``yes`` in that file after alignment so Baum-Welch uses the generated +transcript. Optional: ``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under +``CFG_BIN_DIR``. Beam width follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to +``1e-308`` if unset. 
+ +Usage (from project base, after ``sphinxtrain -t TASK setup``):: + + perl scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH] + +Or:: + + python scripts/11.force_align/multipron_align.py [--dry-run] \\ + [--binary PATH] [--out-dir DIR] [--first-n N] [--beam W] + +Environment: + SPHINX3_ALIGN overrides the configured sphinx3_align path when set. +""" + +from __future__ import annotations + +import os +import re +import subprocess +import sys +from pathlib import Path + +# One-line Perl assignments: $CFG_NAME = value; +_ASSIGN = re.compile( + r"^\s*\$((?:CFG|ST)[A-Z0-9_]*)\s*=\s*(.*?)\s*;\s*(?:#.*)?$", + re.MULTILINE, +) + + +def _parse_sphinx_train_cfg(text: str) -> dict[str, str]: + raw: dict[str, str] = {} + for m in _ASSIGN.finditer(text): + name, val = m.group(1), m.group(2).strip() + if val.startswith(("'", '"')) and len(val) >= 2 and val[-1] == val[0]: + raw[name] = val[1:-1] + else: + raw[name] = val.rstrip(";").strip() + out = dict(raw) + for _ in range(24): + changed = False + for k, v in list(out.items()): + if "$" not in v: + continue + nv = v + for name, repl in out.items(): + nv = nv.replace(f"${{{name}}}", repl) + nv = nv.replace(f"${name}", repl) + if nv != v: + out[k] = nv + changed = True + if not changed: + break + return out + + +def _load_cfg(etc: Path) -> dict[str, str]: + cfg = etc / "sphinx_train.cfg" + if not cfg.is_file(): + print(f"Missing {cfg}", file=sys.stderr) + sys.exit(1) + return _parse_sphinx_train_cfg(cfg.read_text(encoding="utf-8", errors="replace")) + + +def _build_falign_dicts( + dictionary: Path, + filler: Path, + out_dict: Path, + out_fdict: Path, +) -> dict[str, str]: + """Replicate slave_align.pl phase 3: SIL-only fdict; merge other fillers into main dict.""" + silences: dict[str, str] = {} + fillers: dict[str, str] = {} + fdict_sil_lines: list[str] = [] + filler_text = filler.read_text(encoding="utf-8", errors="replace").splitlines() + for line in filler_text: + line = line.strip() + if 
not line or line.startswith("#"): + continue + parts = line.split() + word = parts[0] + phones = parts[1:] + if len(phones) == 1 and re.search(r"^SIL[be]?$", phones[0], re.I): + silences[word] = phones[0] + fdict_sil_lines.append(f"{word}\t{phones[0]}\n") + else: + fillers[word] = " ".join(phones) + out_fdict.write_text("".join(fdict_sil_lines), encoding="utf-8") + merged = dictionary.read_text(encoding="utf-8", errors="replace") + for k in sorted(fillers): + merged += f"{k}\t{fillers[k]}\n" + out_dict.write_text(merged, encoding="utf-8") + return silences + + +def _make_aligninput( + transcript: Path, + out_aligninput: Path, + silences: dict[str, str], + strip_variant_digits: bool, +) -> None: + """Replicate slave_align.pl phase 4 (silence removal + optional variant stripping).""" + lines_out: list[str] = [] + for line in transcript.read_text(encoding="utf-8", errors="replace").splitlines(): + s = line.rstrip("\n") + for sil_word in silences: + s = re.sub(rf"(^|\s){re.escape(sil_word)}(\s|$)", r"\1\2", s) + if strip_variant_digits: + s = re.sub(r"\(\d+\)", "", s) + s = s.strip() + s = re.sub(r"\s+", " ", s) + lines_out.append(s + "\n") + out_aligninput.write_text("".join(lines_out), encoding="utf-8") + + +def _outsent_to_transcription(outsent: Path, out_transcription: Path) -> None: + """sphinx3_align -outsent -> SphinxTrain `` ... 
(uttid)`` (single wrap).""" + lines: list[str] = [] + for line in outsent.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line: + continue + m = re.search(r"\(([^()]+)\)\s*$", line) + if not m: + print(f"Skipping unparsable outsent line: {line!r}", file=sys.stderr) + continue + uttid = m.group(1).strip() + words = line[: m.start()].strip().split() + while words and words[0] == "": + words = words[1:] + while words and words[-1] == "": + words = words[:-1] + inner = " ".join(words) + if not inner: + lines.append(f" ({uttid})\n") + continue + lines.append(f" {inner} ({uttid})\n") + out_transcription.write_text("".join(lines), encoding="utf-8") + + +def _write_skipped_utts_from_log(log_file: Path, out_path: Path) -> None: + if not log_file.is_file(): + return + text = log_file.read_text(encoding="utf-8", errors="replace") + ids: list[str] = [] + for line in text.splitlines(): + if "Wrote -outsent fallback" not in line: + continue + m = re.search(r"for (\S+)\s*$", line) + if m: + ids.append(m.group(1)) + if not ids: + return + out_path.write_text("".join(f"{u}\n" for u in ids), encoding="utf-8") + print(f"Wrote skipped-utterance list ({len(ids)}): {out_path}") + + +def main() -> int: + argv = sys.argv[1:] + dry_run = False + if argv and argv[0] == "--dry-run": + dry_run = True + argv = argv[1:] + if not argv: + print(__doc__, file=sys.stderr) + return 2 + etc = Path(argv[0]).resolve() + bin_override: str | None = None + out_dir: Path | None = None + first_n: int | None = None + beam_override: str | None = None + rest = argv[1:] + i = 0 + while i < len(rest): + if rest[i] == "--binary" and i + 1 < len(rest): + bin_override = rest[i + 1] + i += 2 + continue + if rest[i] == "--out-dir" and i + 1 < len(rest): + out_dir = Path(rest[i + 1]).resolve() + i += 2 + continue + if rest[i] == "--first-n" and i + 1 < len(rest): + first_n = int(rest[i + 1]) + i += 2 + continue + if rest[i] == "--beam" and i + 1 < len(rest): + beam_override = 
rest[i + 1] + i += 2 + continue + print(f"Unexpected argument: {rest[i]}", file=sys.stderr) + return 2 + cfg = _load_cfg(etc) + base_dir = Path(cfg["CFG_BASE_DIR"]) + expt = cfg["CFG_EXPTNAME"] + out_root = out_dir if out_dir is not None else base_dir / "multipron_align" + + align_bin = ( + bin_override + or os.environ.get("SPHINX3_ALIGN") + or cfg.get("CFG_SPHINX3_ALIGN_BINARY") + or str(Path(cfg["CFG_BIN_DIR"]) / "sphinx3_align") + ) + align_path = Path(align_bin) + if not dry_run and not align_path.is_file(): + print( + f"sphinx3_align not found at {align_path} " + "(build the sphinx3_align target, set CFG_SPHINX3_ALIGN_BINARY in the project " + "cfg, or set SPHINX3_ALIGN / --binary).", + file=sys.stderr, + ) + return 1 + + dictionary = Path(cfg["CFG_DICTIONARY"]) + filler = Path(cfg["CFG_FILLERDICT"]) + listoffiles = Path(cfg["CFG_LISTOFFILES"]) + transcript = Path(cfg["CFG_TRANSCRIPTFILE"]) + ctlcount = "1000000" + feat_dir = Path(cfg["CFG_FEATFILES_DIR"]) + feat_ext = "." + cfg["CFG_FEATFILE_EXTENSION"].lstrip(".") + hmm_dir = Path(cfg["CFG_MODEL_DIR"]) / f"{expt}.ci_{cfg['CFG_DIRLABEL']}" + + if not hmm_dir.is_dir(): + print(f"Missing HMM directory {hmm_dir} (train CI models first).", file=sys.stderr) + return 1 + if not feat_dir.is_dir(): + print(f"Missing features directory {feat_dir} (run feature extraction).", file=sys.stderr) + return 1 + if not transcript.is_file(): + print(f"Missing transcript {transcript}", file=sys.stderr) + return 1 + + out_transcription = out_root / f"{expt}.multipron.transcription" + + falign_dict = out_root / f"{expt}.falign.dict" + falign_fdict = out_root / f"{expt}.falign.fdict" + aligninput = out_root / f"{expt}.aligninput" + ctl_path = listoffiles + if first_n is not None and first_n > 0: + ctl_path = out_root / f"{expt}.multipron.shorter.fileids" + ctlcount = str(first_n) + outsent = out_root / f"{expt}.multipron.outsent" + log_file = out_root / f"{expt}.multipron_align.log" + + # sphinx3_align: -beam is a linear 
probability passed to logs3(); smaller p => + # wider Viterbi pruning (see s3_align.c). Default 1e-308 is effectively full width. + beam = beam_override or cfg.get("CFG_FORCE_ALIGN_BEAM") or "1e-308" + statepdeffn = cfg["CFG_HMM_TYPE"] + mwfloor = "1e-8" + minvar = "1e-4" + + args = [ + str(align_path), + "-hmm", + str(hmm_dir), + "-senmgau", + statepdeffn, + "-mixwfloor", + mwfloor, + "-varfloor", + minvar, + "-dict", + str(falign_dict), + "-fdict", + str(falign_fdict), + "-ctl", + str(ctl_path), + "-ctloffset", + "0", + "-ctlcount", + ctlcount, + "-cepdir", + str(feat_dir), + "-cepext", + feat_ext, + "-insent", + str(aligninput), + "-outsent", + str(outsent), + "-beam", + beam, + "-agc", + cfg["CFG_AGC"], + "-cmn", + cfg["CFG_CMN"], + "-varnorm", + cfg["CFG_VARNORM"], + "-feat", + cfg["CFG_FEATURE"], + "-ceplen", + cfg["CFG_VECTOR_LENGTH"], + "-insert_sil", + "1", + ] + + print("Doing multipron force alignment (sphinx3_align)...") + if dry_run: + print("Dry run: would create dicts and aligninput under:\n ", out_root) + print("Dry run: would run:\n ", " ".join(args)) + print(f"Would write transcripts to {out_transcription}") + return 0 + + out_root.mkdir(parents=True, exist_ok=True) + print(f"Writing merged dicts under {out_root}...") + silences = _build_falign_dicts(dictionary, filler, falign_dict, falign_fdict) + _make_aligninput(transcript, aligninput, silences, strip_variant_digits=True) + if first_n is not None and first_n > 0: + ids = listoffiles.read_text(encoding="utf-8", errors="replace").splitlines() + ins_lines = aligninput.read_text(encoding="utf-8", errors="replace").splitlines() + n = min(first_n, len(ids), len(ins_lines)) + ctl_path.write_text("\n".join(ids[:n]) + "\n", encoding="utf-8") + aligninput.write_text("\n".join(ins_lines[:n]) + "\n", encoding="utf-8") + print(f"Limited run to first {n} utterances (--first-n).") + + log_file.write_text( + "Command:\n" + " ".join(args) + "\n\n", encoding="utf-8" + ) + with log_file.open("ab") as logf: + proc 
= subprocess.run(args, stdout=logf, stderr=subprocess.STDOUT) + if proc.returncode != 0: + print(f"sphinx3_align failed (see {log_file})", file=sys.stderr) + return proc.returncode + if not outsent.is_file(): + print(f"Expected output missing: {outsent}", file=sys.stderr) + return 1 + _outsent_to_transcription(outsent, out_transcription) + skipped_utts = out_root / f"{expt}.multipron.skipped_utts" + _write_skipped_utts_from_log(log_file, skipped_utts) + print(f"Wrote {outsent}") + print(f"Wrote {out_transcription}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/30.cd_hmm_untied/slave_convg.pl b/scripts/30.cd_hmm_untied/slave_convg.pl index 2a9d8092..03f77abe 100644 --- a/scripts/30.cd_hmm_untied/slave_convg.pl +++ b/scripts/30.cd_hmm_untied/slave_convg.pl @@ -144,6 +144,10 @@ () if ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; + } elsif (defined($ST::CFG_MULTIPRON) + && $ST::CFG_MULTIPRON eq "yes") { + $listoffiles = $ST::CFG_LISTOFFILES; + $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git a/scripts/60.lattice_generation/slave_genlat.pl b/scripts/60.lattice_generation/slave_genlat.pl index 4ffef05c..f803796b 100644 --- a/scripts/60.lattice_generation/slave_genlat.pl +++ b/scripts/60.lattice_generation/slave_genlat.pl @@ -71,6 +71,10 @@ if ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; +} elsif (defined($ST::CFG_MULTIPRON) + && $ST::CFG_MULTIPRON eq "yes") { + 
$listoffiles = $ST::CFG_LISTOFFILES; + $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git a/scripts/65.mmie_train/baum_welch.pl b/scripts/65.mmie_train/baum_welch.pl index b81fe39d..f0f5da56 100644 --- a/scripts/65.mmie_train/baum_welch.pl +++ b/scripts/65.mmie_train/baum_welch.pl @@ -101,6 +101,10 @@ } elsif ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; +} elsif (defined($ST::CFG_MULTIPRON) + && $ST::CFG_MULTIPRON eq "yes") { + $listoffiles = $ST::CFG_LISTOFFILES; + $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm index a530d711..a1f3ff7a 100644 --- a/scripts/lib/SphinxTrain/Util.pm +++ b/scripts/lib/SphinxTrain/Util.pm @@ -634,6 +634,10 @@ sub GetLists { } elsif ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; + } elsif (defined($ST::CFG_MULTIPRON) + && $ST::CFG_MULTIPRON eq "yes") { + $listoffiles = $ST::CFG_LISTOFFILES; + $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; } else { $listoffiles = $ST::CFG_LISTOFFILES; $transcriptfile = $ST::CFG_TRANSCRIPTFILE; diff --git a/templates/librispeech/etc/sphinx_train.cfg 
b/templates/librispeech/etc/sphinx_train.cfg index 2f50e071..00dbb921 100644 --- a/templates/librispeech/etc/sphinx_train.cfg +++ b/templates/librispeech/etc/sphinx_train.cfg @@ -196,6 +196,14 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE # rejected for bad alignment. $CFG_FORCE_ALIGN_BEAM = 1e-60; +# Multipron alignment (sphinx3_align): set to yes after you run +# perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl +# so Baum-Welch and related steps use +# $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription +# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width +# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts). +$CFG_MULTIPRON = 'no'; + # Calculate an LDA/MLLT transform? $CFG_LDA_MLLT = 'no'; # Dimensionality of LDA/MLLT output From b0a23541c6f026667eb2e934576edebb5da8981f Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 13:52:36 -0400 Subject: [PATCH 04/12] Add vocab_dict filter and optional 00a.vocab_dict script Python reduces the dictionary to words in the transcript vocabulary while keeping pronunciation variants. Perl script is invoked from the optional 00a.vocab_dict Makefile step; uses python like other SphinxTrain drivers. Define CFG_VOCAB_DICT and CFG_VOCAB_DICTIONARY in sphinx_train.cfg. --- etc/sphinx_train.cfg | 4 ++ python/cmusphinx/vocab_dict.py | 83 ++++++++++++++++++++++ scripts/00a.vocab_dict/make_vocab_dict.pl | 78 ++++++++++++++++++++ templates/librispeech/etc/sphinx_train.cfg | 4 ++ 4 files changed, 169 insertions(+) create mode 100644 python/cmusphinx/vocab_dict.py create mode 100644 scripts/00a.vocab_dict/make_vocab_dict.pl diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg index 52f6b01c..a1d712c0 100644 --- a/etc/sphinx_train.cfg +++ b/etc/sphinx_train.cfg @@ -204,6 +204,10 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60; # for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 
1e-308, rejects fewer utts). $CFG_MULTIPRON = 'no'; +# Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). +$CFG_VOCAB_DICT = 'no'; +$CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic"; + # Calculate an LDA/MLLT transform? $CFG_LDA_MLLT = 'no'; # Dimensionality of LDA/MLLT output diff --git a/python/cmusphinx/vocab_dict.py b/python/cmusphinx/vocab_dict.py new file mode 100644 index 00000000..18f10b31 --- /dev/null +++ b/python/cmusphinx/vocab_dict.py @@ -0,0 +1,83 @@ +# Copyright (c) 2009 Carnegie Mellon University +# +# You may copy and modify this freely under the same terms as +# Sphinx-III + +"""Filter dictionary to vocabulary from transcripts. + +Creates a reduced dictionary containing only words that appear in the +training/test transcripts, preserving all pronunciation variants. +""" + +import sys +from cmusphinx import s3dict + + +def load_vocab(vocab_path): + """Load vocabulary from file (one word per line).""" + vocab = set() + with open(vocab_path) as fh: + for line in fh: + word = line.strip() + if word: + vocab.add(word) + return vocab + + +def filter_dict(indict, vocab, outfh): + """Filter dictionary to vocabulary, write to outfh.""" + in_words = set(indict.words()) + kept = vocab & in_words + unused = in_words - vocab + + for w in sorted(kept): + for i, phones in enumerate(indict.alts(w), 1): + if i == 1: + outfh.write("%s %s\n" % (w, " ".join(phones))) + else: + outfh.write("%s(%d) %s\n" % (w, i, " ".join(phones))) + + return kept, unused + + +def main(): + if len(sys.argv) < 4: + print("Usage: %s DICT VOCAB OUTDICT" % sys.argv[0], file=sys.stderr) + sys.exit(1) + + dict_path, vocab_path, out_path = sys.argv[1:4] + + vocab = load_vocab(vocab_path) + indict = s3dict.open(dict_path) + in_words = set(indict.words()) + + with open(out_path, "w") as outfh: + kept, unused = filter_dict(indict, vocab, outfh) + + # Stats + full_prons = sum(indict.maxalt[w] for w in in_words) + kept_prons = sum(indict.maxalt[w] 
for w in kept) + + print("Full dictionary: %7d words" % len(in_words)) + print("Transcript vocab: %7d words" % len(vocab)) + print("Kept in reduced: %7d words" % len(kept)) + print("Removed (unused): %7d words" % len(unused)) + print("Full pronunciations: %7d" % full_prons) + print("Kept pronunciations: %7d" % kept_prons) + + # Write unused words for tracing + unused_path = out_path.replace(".dic", ".unused") + with open(unused_path, "w") as fh: + for w in sorted(unused): + fh.write("%s\n" % w) + print("Unused words: %s" % unused_path) + + # Warn about OOV + oov = vocab - in_words + if oov: + print("OOV words: %7d (missing from dict)" % len(oov), + file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/scripts/00a.vocab_dict/make_vocab_dict.pl b/scripts/00a.vocab_dict/make_vocab_dict.pl new file mode 100644 index 00000000..19f36e42 --- /dev/null +++ b/scripts/00a.vocab_dict/make_vocab_dict.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl +## ==================================================================== +## +## Copyright (c) 1996-2000 Carnegie Mellon University. All rights +## reserved. +## +## Redistribution and use in source and binary forms, with or without +## modification, are permitted provided that the following conditions +## are met: +## +## 1. Redistributions of source code must retain the above copyright +## notice, this list of conditions and the following disclaimer. +## +## 2. Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimer in +## the documentation and/or other materials provided with the +## distribution. +## +## This work was supported in part by funding from the Defense Advanced +## Research Projects Agency and the National Science Foundation of the +## United States of America, and the CMU Sphinx Speech Consortium. 
+## +## THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND +## ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +## THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY +## NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +## +## ==================================================================== + +use strict; +use File::Basename; +use File::Spec::Functions; + +use lib catdir(dirname($0), updir(), 'lib'); +use SphinxTrain::Config; +use SphinxTrain::Util; + +$| = 1; # Turn on autoflushing +Log("MODULE: 00a Create vocabulary-restricted dictionary\n"); + +if ($ST::CFG_VOCAB_DICT ne "yes") { + Log("Skipped (set \$CFG_VOCAB_DICT = 'yes' to enable)\n"); + exit(0); +} + +my $logdir = "$ST::CFG_LOG_DIR/00a.vocab_dict"; +mkdir($logdir, 0777) unless -d $logdir; +my $logfile = "$logdir/$ST::CFG_EXPTNAME.vocab_dict.log"; + +my $vocabfile = "$ST::CFG_LIST_DIR/$ST::CFG_EXPTNAME.vocab"; +my $outdict = $ST::CFG_VOCAB_DICTIONARY; + +unless (-f $vocabfile) { + LogError("Vocabulary file not found: $vocabfile\n"); + LogError("Run 00.verify first to generate it\n"); + exit(1); +} + +Log("Creating vocabulary-restricted dictionary\n"); +Log(" Input dict: $ST::CFG_DICTIONARY\n"); +Log(" Vocabulary: $vocabfile\n"); +Log(" Output dict: $outdict\n"); + +$ENV{PYTHONPATH} .= ':' . 
catdir($ST::CFG_SPHINXTRAIN_DIR, 'python'); +my $rv = RunTool("python", $logfile, 0, + "-m", "cmusphinx.vocab_dict", + $ST::CFG_DICTIONARY, + $vocabfile, + $outdict); + +exit($rv); diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg index 00dbb921..94272c84 100644 --- a/templates/librispeech/etc/sphinx_train.cfg +++ b/templates/librispeech/etc/sphinx_train.cfg @@ -204,6 +204,10 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60; # for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts). $CFG_MULTIPRON = 'no'; +# Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). +$CFG_VOCAB_DICT = 'no'; +$CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic"; + # Calculate an LDA/MLLT transform? $CFG_LDA_MLLT = 'no'; # Dimensionality of LDA/MLLT output From b9a8e32d08eb74a3944a7a36bc868d208196ecd0 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 13:52:38 -0400 Subject: [PATCH 05/12] Gitignore docs/plan-multi-pronunciation-iterative-training.md Keeps optional local design notes out of the repository. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 21789886..7288ba2e 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ autom4te.cache include/stamp-h2 ylwrap build + +# Local design notes (not part of the distributed tree) +docs/plan-multi-pronunciation-iterative-training.md From dbaaa4446ea2c0afbfc04a9d9734174de17582fe Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 14:30:06 -0400 Subject: [PATCH 06/12] Multipron on by default: stage 21 after CI, CFG_MULTIPRON=no to disable Insert training stage 21.multipron_align after CI to run multipron_align.pl when CFG_MULTIPRON is set and not no. Use multipron transcript for CD and later only when the file exists (Util::ShouldUseMultipronTranscript). 
Default CFG_MULTIPRON to yes in sphinx_train.cfg; omit or unset variable keeps legacy behavior without stage 21. Stage 21 driver: import dirname for lib path; do not pass an extra etc path to multipron_align.pl. sphinxtrain decodes POSIX wait status from os.system() so failed Perl stages fail the shell on Unix. --- etc/sphinx_train.cfg | 14 +++---- scripts/11.force_align/multipron_align.pl | 2 +- scripts/11.force_align/multipron_align.py | 10 ++--- scripts/21.multipron_align/slave_align.pl | 41 +++++++++++++++++++ scripts/30.cd_hmm_untied/slave_convg.pl | 5 +-- scripts/60.lattice_generation/slave_genlat.pl | 5 +-- scripts/65.mmie_train/baum_welch.pl | 5 +-- scripts/lib/SphinxTrain/Util.pm | 20 +++++++-- scripts/sphinxtrain | 25 ++++++++--- templates/librispeech/etc/sphinx_train.cfg | 14 +++---- 10 files changed, 102 insertions(+), 39 deletions(-) create mode 100755 scripts/21.multipron_align/slave_align.pl diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg index a1d712c0..a8bd6cb5 100644 --- a/etc/sphinx_train.cfg +++ b/etc/sphinx_train.cfg @@ -196,13 +196,13 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE # rejected for bad alignment. $CFG_FORCE_ALIGN_BEAM = 1e-60; -# Multipron alignment (sphinx3_align): set to yes after you run -# perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl -# so Baum-Welch and related steps use -# $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription -# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width -# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts). -$CFG_MULTIPRON = 'no'; +# Multipron: after CI, stage 21 runs multipron alignment (sphinx3_align); CD and later +# steps use $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription when present. +# If this variable is missing (old configs), multipron is off. 
Set to no to skip stage 21 +# and use only $CFG_TRANSCRIPTFILE everywhere. Requires +# sphinx3_align in $CFG_BIN_DIR. Keep $CFG_FORCEDALIGN = no unless you also run stock +# 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308). +$CFG_MULTIPRON = 'yes'; # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). $CFG_VOCAB_DICT = 'no'; diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl index 62e4fe5b..cb513bbe 100644 --- a/scripts/11.force_align/multipron_align.pl +++ b/scripts/11.force_align/multipron_align.pl @@ -4,7 +4,7 @@ ## Usage (from training project directory, same as sphinxtrain run): ## perl path/to/scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH] ## -## See $CFG_MULTIPRON in etc/sphinx_train.cfg (set to yes after alignment for training). +## See $CFG_MULTIPRON in etc/sphinx_train.cfg (no disables multipron; stage 21 runs this after CI by default). use strict; use warnings; diff --git a/scripts/11.force_align/multipron_align.py b/scripts/11.force_align/multipron_align.py index 9aa4efdb..57a10b9c 100644 --- a/scripts/11.force_align/multipron_align.py +++ b/scripts/11.force_align/multipron_align.py @@ -8,11 +8,11 @@ full SphinxTrain ``sphinxtrain run`` pipeline. Training layout comes from ``etc/sphinx_train.cfg`` (``CFG_BASE_DIR``, ``CFG_EXPTNAME``, -and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``. Set -``CFG_MULTIPRON`` to ``yes`` in that file after alignment so Baum-Welch uses the generated -transcript. Optional: ``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under -``CFG_BIN_DIR``. Beam width follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to -``1e-308`` if unset. +and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``. +Normally stage 21 runs this after CI when ``CFG_MULTIPRON`` is not ``no``. 
Set +``CFG_MULTIPRON`` to ``no`` to disable multipron entirely. Optional: +``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under ``CFG_BIN_DIR``. Beam width +follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to ``1e-308`` if unset. Usage (from project base, after ``sphinxtrain -t TASK setup``):: diff --git a/scripts/21.multipron_align/slave_align.pl b/scripts/21.multipron_align/slave_align.pl new file mode 100755 index 00000000..956cefdc --- /dev/null +++ b/scripts/21.multipron_align/slave_align.pl @@ -0,0 +1,41 @@ +#!/usr/bin/env perl +## After CI HMM training, run multipron force alignment so later stages can use +## pronunciation-disambiguated transcripts. Skipped when $CFG_MULTIPRON is no. + +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Spec::Functions qw(catfile catdir updir); + +use lib catdir(dirname($0), updir(), 'lib'); +use SphinxTrain::Config; +use SphinxTrain::Util; + +$| = 1; +Log("MODULE: 21 Multipron alignment (after CI)\n"); + +if (!defined($ST::CFG_MULTIPRON) || $ST::CFG_MULTIPRON eq "no") { + Log("Skipped (\$CFG_MULTIPRON unset or no)\n"); + exit 0; +} + +my $pl = catfile($ST::CFG_SPHINXTRAIN_DIR, qw(scripts 11.force_align multipron_align.pl)); +my $etc = catfile($ST::CFG_BASE_DIR, "etc"); +unless (-f $pl) { + LogError("Missing $pl\n"); + exit 1; +} +unless (-d $etc) { + LogError("Missing directory $etc\n"); + exit 1; +} + +my $logdir = "$ST::CFG_LOG_DIR/21.multipron_align"; +mkdir($logdir, 0777) unless -d $logdir; +my $logfile = "$logdir/$ST::CFG_EXPTNAME.multipron_align_wrap.log"; + +Log("Running multipron_align.pl (requires CI models under \$CFG_MODEL_DIR)\n"); +# Do not pass $etc on the command line: multipron_align.pl resolves etc/ from +# sphinx_train.cfg; an extra path is forwarded to multipron_align.py as a bogus arg. 
+my $rv = RunTool($^X, $logfile, 0, $pl); +exit($rv); diff --git a/scripts/30.cd_hmm_untied/slave_convg.pl b/scripts/30.cd_hmm_untied/slave_convg.pl index 03f77abe..0898c565 100644 --- a/scripts/30.cd_hmm_untied/slave_convg.pl +++ b/scripts/30.cd_hmm_untied/slave_convg.pl @@ -144,10 +144,9 @@ () if ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; - } elsif (defined($ST::CFG_MULTIPRON) - && $ST::CFG_MULTIPRON eq "yes") { + } elsif (ShouldUseMultipronTranscript()) { $listoffiles = $ST::CFG_LISTOFFILES; - $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; + $transcriptfile = MultipronTranscriptFile(); } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git a/scripts/60.lattice_generation/slave_genlat.pl b/scripts/60.lattice_generation/slave_genlat.pl index f803796b..758e0bfd 100644 --- a/scripts/60.lattice_generation/slave_genlat.pl +++ b/scripts/60.lattice_generation/slave_genlat.pl @@ -71,10 +71,9 @@ if ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; -} elsif (defined($ST::CFG_MULTIPRON) - && $ST::CFG_MULTIPRON eq "yes") { +} elsif (ShouldUseMultipronTranscript()) { $listoffiles = $ST::CFG_LISTOFFILES; - $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; + $transcriptfile = MultipronTranscriptFile(); } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git 
a/scripts/65.mmie_train/baum_welch.pl b/scripts/65.mmie_train/baum_welch.pl index f0f5da56..b98bce6d 100644 --- a/scripts/65.mmie_train/baum_welch.pl +++ b/scripts/65.mmie_train/baum_welch.pl @@ -101,10 +101,9 @@ } elsif ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; -} elsif (defined($ST::CFG_MULTIPRON) - && $ST::CFG_MULTIPRON eq "yes") { +} elsif (ShouldUseMultipronTranscript()) { $listoffiles = $ST::CFG_LISTOFFILES; - $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; + $transcriptfile = MultipronTranscriptFile(); } elsif ($ST::CFG_VTLN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts"; diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm index a1f3ff7a..ce08e4f1 100644 --- a/scripts/lib/SphinxTrain/Util.pm +++ b/scripts/lib/SphinxTrain/Util.pm @@ -45,7 +45,8 @@ use vars qw(@ISA @EXPORT); ImgSrc LogWarning LogError LogProgress LogStatus Converged RunTool SubstParams RunScript LaunchScript WaitForScript GetLists GetDict - WaitForConvergence TiedWaitForConvergence WaitForMMIEConverge Trim); + WaitForConvergence TiedWaitForConvergence WaitForMMIEConverge Trim + MultipronTranscriptFile ShouldUseMultipronTranscript); use Sys::Hostname; use File::Basename; @@ -621,6 +622,18 @@ sub GetDict { } } +sub MultipronTranscriptFile { + return "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; +} + +# Multipron is on only when CFG_MULTIPRON is defined and not "no". Use the multipron transcript +# only after stage 21 has produced the file (after CI when multipron is enabled).
+sub ShouldUseMultipronTranscript { + return 0 unless defined($ST::CFG_MULTIPRON); + return 0 if $ST::CFG_MULTIPRON eq "no"; + return -f MultipronTranscriptFile(); +} + sub GetLists { # aligned transcripts and the list of aligned files is obtained as a result # of (03.) forced alignment or (04.) VTLN @@ -634,10 +647,9 @@ sub GetLists { } elsif ($ST::CFG_FORCEDALIGN eq "yes") { $listoffiles = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles"; $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts"; - } elsif (defined($ST::CFG_MULTIPRON) - && $ST::CFG_MULTIPRON eq "yes") { + } elsif (ShouldUseMultipronTranscript()) { $listoffiles = $ST::CFG_LISTOFFILES; - $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription"; + $transcriptfile = MultipronTranscriptFile(); } else { $listoffiles = $ST::CFG_LISTOFFILES; $transcriptfile = $ST::CFG_TRANSCRIPTFILE; diff --git a/scripts/sphinxtrain b/scripts/sphinxtrain index eaac4f4a..27c6d560 100755 --- a/scripts/sphinxtrain +++ b/scripts/sphinxtrain @@ -11,6 +11,15 @@ sphinxbinpath = "" sphinxpath = "" +def _perl_script_exit_code(status): + """Exit code from an os.system() status (POSIX wait-encoded); a signal-killed child maps to 128+signal, never 0.""" + if os.name == "nt": + return status + if status == 0: + return 0 + return ((status >> 8) & 0xFF) or (128 + (status & 0x7F)) + + def find_paths(): global training_basedir global sphinxbinpath @@ -83,6 +92,7 @@ steps = [ "11.force_align/slave_align.pl", "12.vtln_align/slave_align.pl", "20.ci_hmm/slave_convg.pl", + "21.multipron_align/slave_align.pl", "30.cd_hmm_untied/slave_convg.pl", "40.buildtrees/slave.treebuilder.pl", "45.prunetree/slave.state-tying.pl", @@ -103,8 +113,9 @@ def run_stages(stages): number = step.split("/")[0].split(".")[0] if name == stage or number == stage: ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'") - if ret != 0: - exit(ret) + ec = _perl_script_exit_code(ret) + if ec != 0: + sys.exit(ec) def run_from(stage): @@
-115,16 +126,18 @@ def run_from(stage): if name == stage or number == stage or found: found = True ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'") - if ret != 0: - exit(ret) + ec = _perl_script_exit_code(ret) + if ec != 0: + sys.exit(ec) def run(): print("Running the training") for step in steps: ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'") - if ret != 0: - exit(ret) + ec = _perl_script_exit_code(ret) + if ec != 0: + sys.exit(ec) def usage(): diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg index 94272c84..e4f18332 100644 --- a/templates/librispeech/etc/sphinx_train.cfg +++ b/templates/librispeech/etc/sphinx_train.cfg @@ -196,13 +196,13 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE # rejected for bad alignment. $CFG_FORCE_ALIGN_BEAM = 1e-60; -# Multipron alignment (sphinx3_align): set to yes after you run -# perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl -# so Baum-Welch and related steps use -# $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription -# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width -# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts). -$CFG_MULTIPRON = 'no'; +# Multipron: after CI, stage 21 runs multipron alignment (sphinx3_align); CD and later +# steps use $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription when present. +# If this variable is missing (old configs), multipron is off. Set to no to skip stage 21 +# and use only $CFG_TRANSCRIPTFILE everywhere. Requires +# sphinx3_align in $CFG_BIN_DIR. Keep $CFG_FORCEDALIGN = no unless you also run stock +# 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308). +$CFG_MULTIPRON = 'yes'; # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). 
$CFG_VOCAB_DICT = 'no'; From a03926a163ac0aff1ca83d9894a7f2afc6641e55 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 14:47:11 -0400 Subject: [PATCH 07/12] docs: document multipron stage 21 Describe default multipron alignment after CI, sphinx3_align, and how to disable via CFG_MULTIPRON. Fix Acknowledgments heading spelling. --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a9d23f3..f65e24ed 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,17 @@ You do not need to install SphinxTrain to run it, simply run training directory. Note that you do need to build and install PocketSphinx for evaluation to work properly, however. +Multipron alignment (optional stage 21) +---------------------------------------- + +After CI HMM training, the default configuration runs multipron force +alignment so pronunciation-disambiguated transcripts can be written +under `multipron_align/` in your project. This uses the `sphinx3_align` +program built with the rest of the tree (`cmake --build build`). + +Set `$CFG_MULTIPRON` to `no` in `etc/sphinx_train.cfg` if you want to +skip stage 21 and use only the original transcripts for later stages. + You can also install SphinxTrain system-wide if you so desire: sudo cmake --build build --target install @@ -138,7 +149,7 @@ procedure is identical to the Unix installation. Also, check the section title "All Platforms" above. 
-Acknowldegments +Acknowledgments --------------- The development of this code has included support at different times From 4feb5ae95a954b8d227f5ca0177f2007f9e2930c Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 14:52:30 -0400 Subject: [PATCH 08/12] Include new scripts in CMakeLists --- scripts/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index c41b087e..b34c78aa 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -2,6 +2,7 @@ set(SCRIPTDIRS 0000.g2p_train 000.comp_feat 00.verify +00a.vocab_dict 01.lda_train 02.mllt_train 05.vector_quantize @@ -9,6 +10,7 @@ set(SCRIPTDIRS 11.force_align 12.vtln_align 20.ci_hmm +21.multipron_align 30.cd_hmm_untied 40.buildtrees 45.prunetree From 6f8e7a9a604d4f67b95022bb6737fe95415ffb9d Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 15:01:29 -0400 Subject: [PATCH 09/12] Fix CI --- scripts/11.force_align/multipron_align.pl | 7 ++++++- scripts/21.multipron_align/slave_align.pl | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl index cb513bbe..e83c657e 100644 --- a/scripts/11.force_align/multipron_align.pl +++ b/scripts/11.force_align/multipron_align.pl @@ -27,7 +27,12 @@ die "Missing $py\n"; } -my @py = ('python', $py); +# Prefer $PYTHON; else python3 on Unix (many images have no `python` symlink); else python. +my $pyexe = $ENV{PYTHON}; +if (!defined($pyexe) || $pyexe eq '') { + $pyexe = ($^O eq 'MSWin32') ? 
'python' : 'python3'; +} +my @py = ($pyexe, $py); push @py, '--dry-run' if $dry; push @py, $etc, @args; exec @py; diff --git a/scripts/21.multipron_align/slave_align.pl b/scripts/21.multipron_align/slave_align.pl index 956cefdc..605fd179 100755 --- a/scripts/21.multipron_align/slave_align.pl +++ b/scripts/21.multipron_align/slave_align.pl @@ -38,4 +38,16 @@ # Do not pass $etc on the command line: multipron_align.pl resolves etc/ from # sphinx_train.cfg; an extra path is forwarded to multipron_align.py as a bogus arg. my $rv = RunTool($^X, $logfile, 0, $pl); +if ($rv != 0) { + my $sphinx_log = catfile( + $ST::CFG_BASE_DIR, "multipron_align", + "$ST::CFG_EXPTNAME.multipron_align.log" + ); + LogError( + "multipron_align exit $rv. Stderr from Python is in $logfile; " + . "if sphinx3_align started, full command/output is in $sphinx_log. " + . "Typical causes: sphinx3_align missing under \$CFG_BIN_DIR, " + . "missing CI HMM dir under \$CFG_MODEL_DIR, or sphinx3_align nonzero.\n" + ); +} exit($rv); From c5977e3e0a0e09ef340f50dbe391a97cdc98a45b Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 15:14:29 -0400 Subject: [PATCH 10/12] Allow two-pass and fix some CI --- README.md | 13 +++++++++++++ etc/sphinx_train.cfg | 13 +++++++++++++ scripts/CMakeLists.txt | 1 + scripts/sphinxtrain | 1 + templates/librispeech/etc/sphinx_train.cfg | 13 +++++++++++++ 5 files changed, 41 insertions(+) diff --git a/README.md b/README.md index f65e24ed..222f286a 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,19 @@ program built with the rest of the tree (`cmake --build build`). Set `$CFG_MULTIPRON` to `no` in `etc/sphinx_train.cfg` if you want to skip stage 21 and use only the original transcripts for later stages. +For **semi** (`.semi.`) and **PTM** (`.ptm.`) models, the template turns +multipron and stage 22 off automatically; they are intended for the +**continuous** (`.cont.`) path. 
+ +Optional second CI pass (stage 22) + +After multipron (stage 21), you can set `$CFG_CI_REESTIMATE_AFTER_MULTIPRON` +to `yes` to run **stage 22**, which repeats the same CI training driver as +stage 20. Once the multipron transcript exists, `GetLists()` uses it for +Baum–Welch, so this pass trains CI models on pronunciation-disambiguated +text. It performs a **full** CI cycle again (including flat initialization) +and **replaces** the CI model directory, roughly doubling CI time. Default +is `no`. You can also install SphinxTrain system-wide if you so desire: diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg index a8bd6cb5..eae043a6 100644 --- a/etc/sphinx_train.cfg +++ b/etc/sphinx_train.cfg @@ -204,6 +204,19 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60; # 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308). $CFG_MULTIPRON = 'yes'; +# Second CI pass after multipron (stage 22): re-run stage-20 training with supervision +# from the multipron transcript (GetLists selects it once the file from stage 21 exists). +# Re-runs flat init and replaces the CI model directory—roughly doubles CI wall time. +# Default no; requires CFG_MULTIPRON and a successful stage 21. +$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no'; + +# Stages 21–22 are exercised with continuous ($CFG_HMM_TYPE eq '.cont.') models; semi and +# PTM use a different training/alignment path, so keep them off unless you are on .cont.. +if ($CFG_HMM_TYPE ne '.cont.') { + $CFG_MULTIPRON = 'no'; + $CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no'; +} + # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). 
$CFG_VOCAB_DICT = 'no'; $CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic"; diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index b34c78aa..7039e6c2 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -11,6 +11,7 @@ set(SCRIPTDIRS 12.vtln_align 20.ci_hmm 21.multipron_align +22.ci_hmm_multipron 30.cd_hmm_untied 40.buildtrees 45.prunetree diff --git a/scripts/sphinxtrain b/scripts/sphinxtrain index 27c6d560..10c4fc67 100755 --- a/scripts/sphinxtrain +++ b/scripts/sphinxtrain @@ -93,6 +93,7 @@ steps = [ "12.vtln_align/slave_align.pl", "20.ci_hmm/slave_convg.pl", "21.multipron_align/slave_align.pl", + "22.ci_hmm_multipron/slave_convg.pl", "30.cd_hmm_untied/slave_convg.pl", "40.buildtrees/slave.treebuilder.pl", "45.prunetree/slave.state-tying.pl", diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg index e4f18332..c98da2ab 100644 --- a/templates/librispeech/etc/sphinx_train.cfg +++ b/templates/librispeech/etc/sphinx_train.cfg @@ -204,6 +204,19 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60; # 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308). $CFG_MULTIPRON = 'yes'; +# Second CI pass after multipron (stage 22): re-run stage-20 training with supervision +# from the multipron transcript (GetLists selects it once the file from stage 21 exists). +# Re-runs flat init and replaces the CI model directory—roughly doubles CI wall time. +# Default no; requires CFG_MULTIPRON and a successful stage 21. +$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no'; + +# Stages 21–22 are exercised with continuous ($CFG_HMM_TYPE eq '.cont.') models; semi and +# PTM use a different training/alignment path, so keep them off unless you are on .cont.. +if ($CFG_HMM_TYPE ne '.cont.') { + $CFG_MULTIPRON = 'no'; + $CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no'; +} + # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl). 
$CFG_VOCAB_DICT = 'no'; $CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic"; From 29d5096438168ee4076283d362fae87d38289252 Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 15:20:45 -0400 Subject: [PATCH 11/12] Add scripts/22.ci_hmm_multipron for install target CMake installs SCRIPTDIRS including 22.ci_hmm_multipron; the directory was listed but not in the tree on CI, so cmake --install failed. --- scripts/22.ci_hmm_multipron/slave_convg.pl | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100755 scripts/22.ci_hmm_multipron/slave_convg.pl diff --git a/scripts/22.ci_hmm_multipron/slave_convg.pl b/scripts/22.ci_hmm_multipron/slave_convg.pl new file mode 100755 index 00000000..84ae97aa --- /dev/null +++ b/scripts/22.ci_hmm_multipron/slave_convg.pl @@ -0,0 +1,36 @@ +#!/usr/bin/env perl +## Optional second CI training pass after multipron (stage 21). When enabled, runs the +## same driver as stage 20; Baum-Welch uses GetLists(), which selects the multipron +## transcript once $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription +## exists. This re-runs flat initialization and replaces the CI model directory (same as +## a fresh stage 20). Set $CFG_CI_REESTIMATE_AFTER_MULTIPRON to 'no' (default) to skip. + +use strict; +use warnings; +use File::Basename qw(dirname); +use File::Spec::Functions qw(catfile updir); + +use lib catfile(dirname($0), updir(), 'lib'); +use SphinxTrain::Config; +use SphinxTrain::Util; + +$| = 1; +Log("MODULE: 22 Optional second CI pass (multipron supervision)\n"); + +if (!defined $ST::CFG_CI_REESTIMATE_AFTER_MULTIPRON + || $ST::CFG_CI_REESTIMATE_AFTER_MULTIPRON ne 'yes') { + Log("Skipped (set \$CFG_CI_REESTIMATE_AFTER_MULTIPRON to 'yes' to enable)\n"); + exit 0; +} +if (!ShouldUseMultipronTranscript()) { + LogError( + "CFG_CI_REESTIMATE_AFTER_MULTIPRON is yes but multipron transcript is not available. " + . "Keep \$CFG_MULTIPRON enabled, complete stage 21, or set " + . 
"CFG_CI_REESTIMATE_AFTER_MULTIPRON to no.\n" + ); + exit 1; +} + +my $target = catfile(dirname($0), updir(), '20.ci_hmm', 'slave_convg.pl'); +Log("Invoking $target (same as stage 20; supervision = multipron transcript via GetLists)\n"); +exec $^X, $target, @ARGV or die "exec $target: $!\n"; From 321fd173feb7dd8c42fd0952dbc9ad4dba96acca Mon Sep 17 00:00:00 2001 From: Kevin Lenzo Date: Wed, 8 Apr 2026 15:22:24 -0400 Subject: [PATCH 12/12] Update Util --- scripts/lib/SphinxTrain/Util.pm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm index ce08e4f1..e4e90a88 100644 --- a/scripts/lib/SphinxTrain/Util.pm +++ b/scripts/lib/SphinxTrain/Util.pm @@ -355,8 +355,10 @@ sub RunTool { $returnvalue = 1; last; } - $error_count++ if m/(ERROR).*/; - $warning_count++ if m/(WARNING).*/; + # Match Sphinx-style log lines only. A bare /ERROR/ false-positives on words like + # "TERROR" in transcripts; real messages are "ERROR: ..." (see sphinxbase err_msg). + $error_count++ if m/^\s*ERROR:/; + $warning_count++ if m/^\s*WARN(?:ING)?:/; if ($ctl_counter) { # Keep track of progress being made. $processed_counter++ if (/.*(utt\>).*/);