From ff7ed660d0e7fac29f4b4bf1be56495d96108dec Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 13:52:29 -0400
Subject: [PATCH 01/12] Fix CMake macOS SDK and BLAS/LAPACK cache after Xcode
 upgrades

When the cached sysroot path is missing, set CMAKE_OSX_SYSROOT from xcrun. Before add_subdirectory(src), clear cached FindBLAS, FindLAPACK, and MATH paths that reference a removed SDK or a different SDK tree than the active sysroot. Remove duplicate find_library(MATH_LIBRARY) in src/CMakeLists.txt.
---
 CMakeLists.txt     | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 src/CMakeLists.txt |  1 -
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86789c6a..bbf8e4c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,28 @@
 cmake_minimum_required(VERSION 3.14) # I like pie
 
+# After an Xcode upgrade, CMakeCache may keep CMAKE_OSX_SYSROOT pointing at a removed SDK
+# directory; the link step then fails (e.g. missing .../usr/lib/libm.tbd). Reset an unset or
+# stale absolute path from xcrun before project(); an SDK name (e.g. "macosx") is kept as is.
+if(APPLE)
+  set(_sphinxtrain_sysroot_ok FALSE)
+  if(CMAKE_OSX_SYSROOT AND (NOT IS_ABSOLUTE "${CMAKE_OSX_SYSROOT}" OR EXISTS "${CMAKE_OSX_SYSROOT}"))
+    set(_sphinxtrain_sysroot_ok TRUE)  # usable path, or an SDK name that CMake resolves itself
+  endif()
+  if(NOT _sphinxtrain_sysroot_ok)
+    execute_process(
+      COMMAND xcrun --sdk macosx --show-sdk-path
+      OUTPUT_VARIABLE _sphinxtrain_osx_sdk
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET
+      RESULT_VARIABLE _sphinxtrain_xcrun_rv
+    )
+    if(_sphinxtrain_xcrun_rv EQUAL 0 AND _sphinxtrain_osx_sdk AND EXISTS "${_sphinxtrain_osx_sdk}")
+      set(CMAKE_OSX_SYSROOT "${_sphinxtrain_osx_sdk}" CACHE PATH "macOS SDK path" FORCE)
+      message(STATUS "CMAKE_OSX_SYSROOT -> ${CMAKE_OSX_SYSROOT}")
+    endif()
+  endif()
+endif()
+
 project(SphinxTrain VERSION 5.0.0
   DESCRIPTION "CMU Sphinx Trainer"
   HOMEPAGE_URL "https://github.com/cmusphinx/sphinxtrain")
@@ -66,6 +89,71 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 configure_file(config.h.in config.h)
 add_definitions(-DHAVE_CONFIG_H)
 
+# FindBLAS / FindLAPACK cache absolute library paths under the SDK; after an Xcode upgrade
+# those files may move (e.g. .../MacOSX26.1.sdk/...) while CMAKE_OSX_SYSROOT was fixed above.
+# Drop cached paths that no longer exist, or that exist under a different *.sdk tree.
+if(APPLE)
+  set(_sphinxtrain_apple_libcache_vars
+    BLAS_Accelerate_LIBRARY
+    BLAS_blas_LIBRARY
+    BLAS_LIBRARY
+    LAPACK_Accelerate_LIBRARY
+    LAPACK_lapack_LIBRARY
+    LAPACK_LIBRARY
+    MATH_LIBRARY
+  )
+  foreach(_sphinxtrain_v IN LISTS _sphinxtrain_apple_libcache_vars)
+    if(${_sphinxtrain_v})  # skips unset, empty and other false values such as *-NOTFOUND
+      set(_sphinxtrain_p "${${_sphinxtrain_v}}")
+      if(_sphinxtrain_p MATCHES "^/")  # only absolute paths are examined
+        set(_sphinxtrain_drop FALSE)
+        if(NOT EXISTS "${_sphinxtrain_p}")
+          set(_sphinxtrain_drop TRUE)
+        elseif(CMAKE_OSX_SYSROOT AND _sphinxtrain_p MATCHES "/[^/]+\\.sdk/")  # inside some SDK
+          string(LENGTH "${CMAKE_OSX_SYSROOT}" _sphinxtrain_sl)
+          if(_sphinxtrain_sl GREATER 0)
+            string(SUBSTRING "${_sphinxtrain_p}" 0 ${_sphinxtrain_sl} _sphinxtrain_head)
+            if(NOT _sphinxtrain_head STREQUAL CMAKE_OSX_SYSROOT)  # path does not start with sysroot
+              set(_sphinxtrain_drop TRUE)
+            endif()
+          endif()
+        endif()
+        if(_sphinxtrain_drop)
+          unset(${_sphinxtrain_v} CACHE)  # removes the cache entry only
+        endif()
+      endif()
+    endif()
+  endforeach()
+  foreach(_sphinxtrain_listvar BLAS_LIBRARIES LAPACK_LIBRARIES)  # same checks, per list item
+    if(${_sphinxtrain_listvar})
+      set(_sphinxtrain_drop_libs FALSE)
+      foreach(_sphinxtrain_item IN LISTS ${_sphinxtrain_listvar})
+        if(_sphinxtrain_item MATCHES "^/")
+          if(NOT EXISTS "${_sphinxtrain_item}")
+            set(_sphinxtrain_drop_libs TRUE)
+          elseif(CMAKE_OSX_SYSROOT AND _sphinxtrain_item MATCHES "/[^/]+\\.sdk/")
+            string(LENGTH "${CMAKE_OSX_SYSROOT}" _sphinxtrain_sl)
+            if(_sphinxtrain_sl GREATER 0)
+              string(SUBSTRING "${_sphinxtrain_item}" 0 ${_sphinxtrain_sl} _sphinxtrain_head)
+              if(NOT _sphinxtrain_head STREQUAL CMAKE_OSX_SYSROOT)
+                set(_sphinxtrain_drop_libs TRUE)
+              endif()
+            endif()
+          endif()
+        endif()
+      endforeach()
+      if(_sphinxtrain_drop_libs)  # one stale item discards the whole cached list
+        unset(${_sphinxtrain_listvar} CACHE)
+        if(_sphinxtrain_listvar STREQUAL "BLAS_LIBRARIES")
+          unset(BLAS_FOUND CACHE)
+        else()
+          unset(LAPACK_FOUND CACHE)
+        endif()
+      endif()
+    endif()
+  endforeach()
+endif()
+
 # Compile all the things
 add_subdirectory(src)
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6a80637b..f86b3a36 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -89,7 +89,6 @@ libs/libsphinxbase/util/f2c_lite.c
 )
 find_package(BLAS)
 find_package(LAPACK)
-find_library(MATH_LIBRARY m)
 if(NOT LAPACK_FOUND)
   message("System-wide LAPACK not found, will use internal version")
   add_library(sphinxtrain ${SRCS} ${LAPACK_SRCS})

From f22fb55978fdf154dc95e770f3342bb0def44bf8 Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 13:52:29 -0400
Subject: [PATCH 02/12] sphinx3_align: handle align_frame failure without
 aborting

Return -1 from align_frame on bad alignment instead of asserting. On failure, emit a reference outsent line when possible and continue. Log how many outsent fallbacks were written at the end of a run.
---
 src/programs/sphinx3_align/main_align.c | 57 +++++++++++++++++++++++--
 src/programs/sphinx3_align/s3_align.c   |  7 ++-
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/programs/sphinx3_align/main_align.c b/src/programs/sphinx3_align/main_align.c
index 100e5ab0..52ae4f5b 100644
--- a/src/programs/sphinx3_align/main_align.c
+++ b/src/programs/sphinx3_align/main_align.c
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <ctype.h>
 #include <assert.h>
 
 #include <sphinxbase/bio.h>
@@ -174,6 +175,8 @@ static FILE *outctlfp = NULL;
 static const char *sentfile;
 static FILE *sentfp = NULL;
 
+static int32 n_align_outsent_fallback;
+
 static char *s2stsegdir = NULL;
 static char *stsegdir = NULL;
 static char *phsegdir = NULL;
@@ -659,6 +662,37 @@ write_outctl(FILE * fp, char *uttctl)
     fflush(fp);
 }
 
+/* Trim leading and trailing whitespace from s in place (NULL or "" is left alone). */
+static void
+trim_spaces(char *s)
+{
+    char *first = s;
+    size_t n;
+
+    if (s == NULL || *s == '\0')
+        return;
+    while (isspace((unsigned char) *first))     /* stops at the terminator at the latest */
+        ++first;
+    for (n = strlen(first); n > 0 && isspace((unsigned char) first[n - 1]); --n);
+    memmove(s, first, n);       /* regions may overlap; nothing moves if first == s */
+    s[n] = '\0';
+}
+
+/**
+ * Write the reference (-insent) text as "<sent> (<uttid>)" to fp when alignment fails, so that
+ * -outsent keeps one line per -ctl utterance. No-op when fp is NULL; sent is trimmed in place.
+ */
+static void
+write_outsent_insent_fallback(FILE * fp, char *sent, const char *uttid)
+{
+    if (!fp)
+        return;
+    trim_spaces(sent);
+    fprintf(fp, "%s (%s)\n", sent, uttid);
+    fflush(fp);
+    ++n_align_outsent_fallback;     /* total is logged by main() at the end of the run */
+    E_INFO("Wrote -outsent fallback (reference text) for %s\n", uttid);
+}
 
 
 /*
@@ -681,6 +715,7 @@ align_utt(char *sent,           /* In: Reference transcript */
     if (nfr <= (w << 1)) {
         E_ERROR("Utterance %s < %d frames (%d); ignored\n", uttid,
                 (w << 1) + 1, nfr);
+        write_outsent_insent_fallback(outsentfp, sent, uttid);
         return;
     }
 
@@ -698,7 +733,7 @@ align_utt(char *sent,           /* In: Reference transcript */
         ptmr_stop(timers + tmr_utt);
 
         E_ERROR("No sentence HMM; no alignment for %s\n", uttid);
-
+        write_outsent_insent_fallback(outsentfp, sent, uttid);
         return;
     }
 
@@ -751,7 +786,16 @@ align_utt(char *sent,           /* In: Reference transcript */
 
         /* Step alignment one frame forward */
         ptmr_start(timers + tmr_align);
-        align_frame(ascr->senscr);
+        if (align_frame(ascr->senscr) < 0) {
+            ptmr_stop(timers + tmr_align);
+            ptmr_stop(timers + tmr_utt);
+            ptmr_stop(&tm_utt);
+            ptmr_stop(&tm_ovrhd);
+            E_ERROR("Alignment failed mid-utterance for %s\n", uttid);
+            write_outsent_insent_fallback(outsentfp, sent, uttid);
+            align_destroy_sent_hmm();
+            return;
+        }
         ptmr_stop(timers + tmr_align);
         ptmr_stop(timers + tmr_utt);
     }
@@ -761,8 +805,10 @@ align_utt(char *sent,           /* In: Reference transcript */
     printf("\n");
 
     /* Wind up alignment for this utterance */
-    if (align_end_utt(&stseg, &phseg, &wdseg) < 0)
+    if (align_end_utt(&stseg, &phseg, &wdseg) < 0) {
         E_ERROR("Final state not reached; no alignment for %s\n\n", uttid);
+        write_outsent_insent_fallback(outsentfp, sent, uttid);
+    }
     else {
         if (s2stsegdir)
             write_s2stseg(s2stsegdir, stseg, uttid, ctlspec, cmd_ln_boolean_r(kbc->config, "-s2cdsen"));
@@ -914,6 +960,7 @@ utt_align(void *data, utt_res_t * ur, int32 sf, int32 ef, char *uttid)
                 ("Utt %s: Input file read (%s) with extension (%s) failed \n",
                  uttid, ur->uttfile, cepext);
         }
+        write_outsent_insent_fallback(outsentfp, sent, uttid);
     }
     else {
         E_INFO("%s: %d input frames\n", uttid, nfr);
@@ -1039,6 +1086,10 @@ main(int32 argc, char *argv[])
                tm_utt.t_tot_elapsed / (tot_nfr * 0.01));
     }
 
+    if (n_align_outsent_fallback > 0)
+        E_INFO("Utterances with -outsent reference fallback (alignment skipped): %d\n",
+               n_align_outsent_fallback);
+
     if (outsentfp)
         fclose(outsentfp);
     if (outctlfp)
diff --git a/src/programs/sphinx3_align/s3_align.c b/src/programs/sphinx3_align/s3_align.c
index 9bb7d594..e33bc108 100644
--- a/src/programs/sphinx3_align/s3_align.c
+++ b/src/programs/sphinx3_align/s3_align.c
@@ -1078,7 +1078,12 @@ align_frame(int32 * senscr)
                 }
             }
         }
-        assert(tmpbest > (int32) 0x80000000);
+        if (tmpbest <= (int32) 0x80000000) {
+            E_ERROR
+                ("No active predecessor for aligner state (frame %d); try a wider -beam (e.g. 1e-308)\n",
+                 curfrm);
+            return -1;
+        }
 
         s->newscore = tmpbest + senscr[s->sen];
         s->newhist = tmphist;

From 2272750f4f82da595da5f63b6184c388b3aec2e3 Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 13:52:32 -0400
Subject: [PATCH 03/12] Add multipron force alignment and CFG_MULTIPRON
 training hook

Add CFG_MULTIPRON to sphinx_train.cfg: when yes, Baum-Welch and related steps use BASE_DIR/multipron_align/EXPT.multipron.transcription. Extend GetLists and the untied-HMM, lattice, and BW slaves accordingly. Add multipron_align.pl and multipron_align.py to merge dicts, run sphinx3_align, and write that transcript (beam from CFG_FORCE_ALIGN_BEAM or default 1e-308).
---
 etc/sphinx_train.cfg                          |   8 +
 scripts/11.force_align/multipron_align.pl     |  34 ++
 scripts/11.force_align/multipron_align.py     | 349 ++++++++++++++++++
 scripts/30.cd_hmm_untied/slave_convg.pl       |   4 +
 scripts/60.lattice_generation/slave_genlat.pl |   4 +
 scripts/65.mmie_train/baum_welch.pl           |   4 +
 scripts/lib/SphinxTrain/Util.pm               |   4 +
 templates/librispeech/etc/sphinx_train.cfg    |   8 +
 8 files changed, 415 insertions(+)
 create mode 100644 scripts/11.force_align/multipron_align.pl
 create mode 100644 scripts/11.force_align/multipron_align.py

diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg
index fc566527..52f6b01c 100644
--- a/etc/sphinx_train.cfg
+++ b/etc/sphinx_train.cfg
@@ -196,6 +196,14 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE
 # rejected for bad alignment.
 $CFG_FORCE_ALIGN_BEAM = 1e-60;
 
+# Multipron alignment (sphinx3_align): set to yes after you run
+#   perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl
+# so Baum-Welch and related steps use
+#   $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription
+# If $CFG_FORCEDALIGN is yes it takes precedence and this setting is ignored, so keep it no.
+# multipron_align uses $CFG_FORCE_ALIGN_BEAM (a smaller value, e.g. 1e-308, rejects fewer utts).
+$CFG_MULTIPRON = 'no';
+
 # Calculate an LDA/MLLT transform?
 $CFG_LDA_MLLT = 'no';
 # Dimensionality of LDA/MLLT output
diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl
new file mode 100644
index 00000000..62e4fe5b
--- /dev/null
+++ b/scripts/11.force_align/multipron_align.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+## Run multipron_align.py using paths from sphinx_train.cfg (project base = $CFG_BASE_DIR).
+##
+## Usage (from training project directory, same as sphinxtrain run):
+##   perl path/to/scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH]
+##
+## See $CFG_MULTIPRON in etc/sphinx_train.cfg (set to yes after alignment for training).
+
+use strict;
+use warnings;
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(catfile updir);
+
+use lib catfile(dirname($0), updir(), 'lib');  # <directory of this script>/../lib
+use SphinxTrain::Config;
+
+my @args = @ARGV;
+my $dry = (@args && $args[0] eq '--dry-run') ? shift @args : 0;  # recognised only as first argument
+
+my $etc = catfile($ST::CFG_BASE_DIR, 'etc');
+if (!-d $etc) {
+    die "Missing directory $etc (run from project after setup, or fix \$CFG_BASE_DIR)\n";
+}
+
+my $py = catfile($ST::CFG_SPHINXTRAIN_DIR, 'scripts', '11.force_align', 'multipron_align.py');
+if (!-f $py) {
+    die "Missing $py\n";
+}
+
+my @py = ('python', $py);  # NOTE(review): the .py uses Python 3 syntax -- confirm 'python' is Python 3
+push @py, '--dry-run' if $dry;
+push @py, $etc, @args;  # etc directory first, then the remaining options unchanged
+exec @py;  # replaces this process; control continues below only if the exec itself fails
+die "exec failed: $!\n";
diff --git a/scripts/11.force_align/multipron_align.py b/scripts/11.force_align/multipron_align.py
new file mode 100644
index 00000000..9aa4efdb
--- /dev/null
+++ b/scripts/11.force_align/multipron_align.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python
+"""
+Force-align the training set with sphinx3_align and write SphinxTrain-style transcripts
+that reflect the Viterbi-best pronunciation variants (dict word strings from -outsent).
+
+This mirrors scripts/11.force_align/slave_align.pl phases 3–4 (dictionary merge + aligninput)
+and scripts/11.force_align/force_align.pl (sphinx3_align invocation), without running the
+full SphinxTrain ``sphinxtrain run`` pipeline.
+
+Training layout comes from ``etc/sphinx_train.cfg`` (``CFG_BASE_DIR``, ``CFG_EXPTNAME``,
+and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``. Set
+``CFG_MULTIPRON`` to ``yes`` in that file after alignment so Baum-Welch uses the generated
+transcript. Optional: ``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under
+``CFG_BIN_DIR``. Beam width follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to
+``1e-308`` if unset.
+
+Usage (from project base, after ``sphinxtrain -t TASK setup``)::
+
+  perl scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH]
+
+Or::
+
+  python scripts/11.force_align/multipron_align.py [--dry-run] <project_etc_dir> \\
+      [--binary PATH] [--out-dir DIR] [--first-n N] [--beam W]
+
+Environment:
+  SPHINX3_ALIGN  overrides the configured sphinx3_align path when set.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+# One-line Perl assignments: $CFG_NAME = value;
+_ASSIGN = re.compile(
+    r"^\s*\$((?:CFG|ST)[A-Z0-9_]*)\s*=\s*(.*?)\s*;\s*(?:#.*)?$",
+    re.MULTILINE,
+)
+
+
+def _parse_sphinx_train_cfg(text: str) -> dict[str, str]:
+    """Parse one-line ``$CFG_X = value;`` assignments and expand ``$NAME`` / ``${NAME}`` references."""
+    raw: dict[str, str] = {}
+    for m in _ASSIGN.finditer(text):
+        name, val = m.group(1), m.group(2).strip()
+        if val.startswith(("'", '"')) and len(val) >= 2 and val[-1] == val[0]:
+            raw[name] = val[1:-1]
+        else:
+            raw[name] = val.rstrip(";").strip()
+    out = dict(raw)
+    # Match whole identifiers (as Perl interpolation does) so a short name such as
+    # $CFG_VOCAB_DICT is never expanded inside a longer one like $CFG_VOCAB_DICTIONARY.
+    ref = re.compile(r"\$(?:\{(\w+)\}|(\w+))")
+    for _ in range(24):  # bounded number of passes: nested references resolve, cycles cannot hang
+        changed = False
+        for k, v in list(out.items()):
+            # References to names that were never assigned are left verbatim.
+            nv = ref.sub(lambda r: out.get(r.group(1) or r.group(2), r.group(0)), v)
+            if nv != v:
+                out[k] = nv
+                changed = True
+        if not changed:
+            break
+    return out
+
+
+def _load_cfg(etc: Path) -> dict[str, str]:  # parse <etc>/sphinx_train.cfg, or exit(1) if absent
+    cfg_path = etc / "sphinx_train.cfg"
+    if cfg_path.is_file():
+        return _parse_sphinx_train_cfg(cfg_path.read_text(encoding="utf-8", errors="replace"))
+    print(f"Missing {cfg_path}", file=sys.stderr)
+    sys.exit(1)
+
+
+def _build_falign_dicts(
+    dictionary: Path,
+    filler: Path,
+    out_dict: Path,
+    out_fdict: Path,
+) -> dict[str, str]:
+    """Replicate slave_align.pl phase 3: SIL-only fdict; merge other fillers into main dict."""
+    silences: dict[str, str] = {}
+    fillers: dict[str, str] = {}
+    fdict_sil_lines: list[str] = []
+    for line in filler.read_text(encoding="utf-8", errors="replace").splitlines():
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        word, *phones = line.split()
+        if len(phones) == 1 and re.search(r"^SIL[be]?$", phones[0], re.I):
+            silences[word] = phones[0]
+            fdict_sil_lines.append(f"{word}\t{phones[0]}\n")
+        else:
+            fillers[word] = " ".join(phones)
+    out_fdict.write_text("".join(fdict_sil_lines), encoding="utf-8")
+    merged = dictionary.read_text(encoding="utf-8", errors="replace")
+    # A dictionary without a final newline would get the first filler glued onto its last entry.
+    if merged and not merged.endswith("\n"):
+        merged += "\n"
+    for k in sorted(fillers):
+        merged += f"{k}\t{fillers[k]}\n"
+    out_dict.write_text(merged, encoding="utf-8")
+    return silences
+
+
+def _make_aligninput(
+    transcript: Path,
+    out_aligninput: Path,
+    silences: dict[str, str],
+    strip_variant_digits: bool,
+) -> None:
+    """Replicate slave_align.pl phase 4 (silence removal + optional variant stripping)."""
+    lines_out: list[str] = []
+    for s in transcript.read_text(encoding="utf-8", errors="replace").splitlines():
+        for sil_word in silences:
+            # Lookarounds do not consume the separating space, so every copy in a run such
+            # as "<sil> <sil>" is removed (a consuming pattern would skip every second one).
+            s = re.sub(rf"(?<!\S){re.escape(sil_word)}(?!\S)", "", s)
+        if strip_variant_digits:
+            s = re.sub(r"\(\d+\)", "", s)
+        s = re.sub(r"\s+", " ", s.strip())
+        lines_out.append(s + "\n")
+    out_aligninput.write_text("".join(lines_out), encoding="utf-8")
+
+
+def _outsent_to_transcription(outsent: Path, out_transcription: Path) -> None:
+    """Convert sphinx3_align -outsent lines to ``<s> ... </s> (uttid)`` transcript lines.
+
+    Blank lines are dropped; lines lacking a trailing ``(uttid)`` are reported and skipped."""
+    converted: list[str] = []
+    for raw in outsent.read_text(encoding="utf-8", errors="replace").splitlines():
+        line = raw.strip()
+        if not line:
+            continue
+        tail = re.search(r"\(([^()]+)\)\s*$", line)
+        if tail is None:
+            print(f"Skipping unparsable outsent line: {line!r}", file=sys.stderr)
+            continue
+        tokens = line[: tail.start()].split()
+        lo, hi = 0, len(tokens)  # window that excludes existing <s> / </s> wrappers
+        while lo < hi and tokens[lo] == "<s>":
+            lo += 1
+        while hi > lo and tokens[hi - 1] == "</s>":
+            hi -= 1
+        inner = " ".join(tokens[lo:hi])
+        uttid = tail.group(1).strip()
+        converted.append(f"<s> {inner} </s> ({uttid})\n" if inner else f"<s> </s> ({uttid})\n")
+    out_transcription.write_text("".join(converted), encoding="utf-8")
+
+
+def _write_skipped_utts_from_log(log_file: Path, out_path: Path) -> None:
+    """List utterances for which the log reports "Wrote -outsent fallback", one id per line.
+
+    If the log has no such lines, a list left by an earlier run is deleted rather than kept."""
+    if not log_file.is_file():
+        return
+    log_lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
+    hits = (re.search(r"for (\S+)\s*$", ln) for ln in log_lines if "Wrote -outsent fallback" in ln)
+    ids = [m.group(1) for m in hits if m]
+    if not ids:
+        if out_path.is_file():
+            out_path.unlink()
+        return
+    out_path.write_text("".join(f"{u}\n" for u in ids), encoding="utf-8")
+    print(f"Wrote skipped-utterance list ({len(ids)}): {out_path}")
+
+
+def main() -> int:  # returns the process exit status (0 ok, 2 usage error, else failure)
+    argv = sys.argv[1:]
+    dry_run = False
+    if argv and argv[0] == "--dry-run":  # honoured only as the very first argument
+        dry_run = True
+        argv = argv[1:]
+    if not argv:
+        print(__doc__, file=sys.stderr)
+        return 2
+    etc = Path(argv[0]).resolve()  # directory that holds sphinx_train.cfg
+    bin_override: str | None = None
+    out_dir: Path | None = None
+    first_n: int | None = None
+    beam_override: str | None = None
+    rest = argv[1:]
+    i = 0
+    while i < len(rest):  # an option given without its value ends up as "Unexpected argument"
+        if rest[i] == "--binary" and i + 1 < len(rest):
+            bin_override = rest[i + 1]
+            i += 2
+            continue
+        if rest[i] == "--out-dir" and i + 1 < len(rest):
+            out_dir = Path(rest[i + 1]).resolve()
+            i += 2
+            continue
+        if rest[i] == "--first-n" and i + 1 < len(rest):
+            first_n = int(rest[i + 1])
+            i += 2
+            continue
+        if rest[i] == "--beam" and i + 1 < len(rest):
+            beam_override = rest[i + 1]
+            i += 2
+            continue
+        print(f"Unexpected argument: {rest[i]}", file=sys.stderr)
+        return 2
+    cfg = _load_cfg(etc)
+    base_dir = Path(cfg["CFG_BASE_DIR"])
+    expt = cfg["CFG_EXPTNAME"]
+    out_root = out_dir if out_dir is not None else base_dir / "multipron_align"
+
+    align_bin = (  # first non-empty wins: --binary, $SPHINX3_ALIGN, cfg entry, CFG_BIN_DIR default
+        bin_override
+        or os.environ.get("SPHINX3_ALIGN")
+        or cfg.get("CFG_SPHINX3_ALIGN_BINARY")
+        or str(Path(cfg["CFG_BIN_DIR"]) / "sphinx3_align")
+    )
+    align_path = Path(align_bin)
+    if not dry_run and not align_path.is_file():
+        print(
+            f"sphinx3_align not found at {align_path} "
+            "(build the sphinx3_align target, set CFG_SPHINX3_ALIGN_BINARY in the project "
+            "cfg, or set SPHINX3_ALIGN / --binary).",
+            file=sys.stderr,
+        )
+        return 1
+
+    dictionary = Path(cfg["CFG_DICTIONARY"])
+    filler = Path(cfg["CFG_FILLERDICT"])
+    listoffiles = Path(cfg["CFG_LISTOFFILES"])
+    transcript = Path(cfg["CFG_TRANSCRIPTFILE"])
+    ctlcount = "1000000"  # passed as -ctlcount; replaced by N when --first-n is given
+    feat_dir = Path(cfg["CFG_FEATFILES_DIR"])
+    feat_ext = "." + cfg["CFG_FEATFILE_EXTENSION"].lstrip(".")
+    hmm_dir = Path(cfg["CFG_MODEL_DIR"]) / f"{expt}.ci_{cfg['CFG_DIRLABEL']}"
+
+    if not hmm_dir.is_dir():
+        print(f"Missing HMM directory {hmm_dir} (train CI models first).", file=sys.stderr)
+        return 1
+    if not feat_dir.is_dir():
+        print(f"Missing features directory {feat_dir} (run feature extraction).", file=sys.stderr)
+        return 1
+    if not transcript.is_file():
+        print(f"Missing transcript {transcript}", file=sys.stderr)
+        return 1
+
+    out_transcription = out_root / f"{expt}.multipron.transcription"
+
+    falign_dict = out_root / f"{expt}.falign.dict"
+    falign_fdict = out_root / f"{expt}.falign.fdict"
+    aligninput = out_root / f"{expt}.aligninput"
+    ctl_path = listoffiles
+    if first_n is not None and first_n > 0:  # align against a truncated copy of the control file
+        ctl_path = out_root / f"{expt}.multipron.shorter.fileids"
+        ctlcount = str(first_n)
+    outsent = out_root / f"{expt}.multipron.outsent"
+    log_file = out_root / f"{expt}.multipron_align.log"
+
+    # sphinx3_align: -beam is a linear probability passed to logs3(); smaller p =>
+    # wider Viterbi pruning (see s3_align.c). Default 1e-308 is effectively full width.
+    beam = beam_override or cfg.get("CFG_FORCE_ALIGN_BEAM") or "1e-308"
+    statepdeffn = cfg["CFG_HMM_TYPE"]
+    mwfloor = "1e-8"
+    minvar = "1e-4"
+
+    args = [
+        str(align_path),
+        "-hmm",
+        str(hmm_dir),
+        "-senmgau",
+        statepdeffn,
+        "-mixwfloor",
+        mwfloor,
+        "-varfloor",
+        minvar,
+        "-dict",
+        str(falign_dict),
+        "-fdict",
+        str(falign_fdict),
+        "-ctl",
+        str(ctl_path),
+        "-ctloffset",
+        "0",
+        "-ctlcount",
+        ctlcount,
+        "-cepdir",
+        str(feat_dir),
+        "-cepext",
+        feat_ext,
+        "-insent",
+        str(aligninput),
+        "-outsent",
+        str(outsent),
+        "-beam",
+        beam,
+        "-agc",
+        cfg["CFG_AGC"],
+        "-cmn",
+        cfg["CFG_CMN"],
+        "-varnorm",
+        cfg["CFG_VARNORM"],
+        "-feat",
+        cfg["CFG_FEATURE"],
+        "-ceplen",
+        cfg["CFG_VECTOR_LENGTH"],
+        "-insert_sil",
+        "1",
+    ]
+
+    print("Doing multipron force alignment (sphinx3_align)...")
+    if dry_run:  # report only: returns before anything is created or executed
+        print("Dry run: would create dicts and aligninput under:\n ", out_root)
+        print("Dry run: would run:\n ", " ".join(args))
+        print(f"Would write transcripts to {out_transcription}")
+        return 0
+
+    out_root.mkdir(parents=True, exist_ok=True)
+    print(f"Writing merged dicts under {out_root}...")
+    silences = _build_falign_dicts(dictionary, filler, falign_dict, falign_fdict)
+    _make_aligninput(transcript, aligninput, silences, strip_variant_digits=True)
+    if first_n is not None and first_n > 0:
+        ids = listoffiles.read_text(encoding="utf-8", errors="replace").splitlines()
+        ins_lines = aligninput.read_text(encoding="utf-8", errors="replace").splitlines()
+        n = min(first_n, len(ids), len(ins_lines))  # keep control file and aligninput equally long
+        ctl_path.write_text("\n".join(ids[:n]) + "\n", encoding="utf-8")
+        aligninput.write_text("\n".join(ins_lines[:n]) + "\n", encoding="utf-8")
+        print(f"Limited run to first {n} utterances (--first-n).")
+
+    log_file.write_text(  # start a fresh log with the command line; tool output is appended below
+        "Command:\n" + " ".join(args) + "\n\n", encoding="utf-8"
+    )
+    with log_file.open("ab") as logf:
+        proc = subprocess.run(args, stdout=logf, stderr=subprocess.STDOUT)
+    if proc.returncode != 0:
+        print(f"sphinx3_align failed (see {log_file})", file=sys.stderr)
+        return proc.returncode
+    if not outsent.is_file():
+        print(f"Expected output missing: {outsent}", file=sys.stderr)
+        return 1
+    _outsent_to_transcription(outsent, out_transcription)
+    skipped_utts = out_root / f"{expt}.multipron.skipped_utts"
+    _write_skipped_utts_from_log(log_file, skipped_utts)  # ids taken from fallback lines in the log
+    print(f"Wrote {outsent}")
+    print(f"Wrote {out_transcription}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/30.cd_hmm_untied/slave_convg.pl b/scripts/30.cd_hmm_untied/slave_convg.pl
index 2a9d8092..03f77abe 100644
--- a/scripts/30.cd_hmm_untied/slave_convg.pl
+++ b/scripts/30.cd_hmm_untied/slave_convg.pl
@@ -144,6 +144,10 @@ ()
   if ($ST::CFG_FORCEDALIGN eq "yes") {
       $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
       $transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
+  } elsif (defined($ST::CFG_MULTIPRON)
+	   && $ST::CFG_MULTIPRON eq "yes") {
+      $listoffiles = $ST::CFG_LISTOFFILES;
+      $transcriptfile  = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
   } elsif ($ST::CFG_VTLN eq "yes") {
       $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
       $transcriptfile  = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/60.lattice_generation/slave_genlat.pl b/scripts/60.lattice_generation/slave_genlat.pl
index 4ffef05c..f803796b 100644
--- a/scripts/60.lattice_generation/slave_genlat.pl
+++ b/scripts/60.lattice_generation/slave_genlat.pl
@@ -71,6 +71,10 @@
 if ($ST::CFG_FORCEDALIGN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
+} elsif (defined($ST::CFG_MULTIPRON)
+	 && $ST::CFG_MULTIPRON eq "yes") {
+    $listoffiles = $ST::CFG_LISTOFFILES;
+    $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
 } elsif ($ST::CFG_VTLN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile  = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/65.mmie_train/baum_welch.pl b/scripts/65.mmie_train/baum_welch.pl
index b81fe39d..f0f5da56 100644
--- a/scripts/65.mmie_train/baum_welch.pl
+++ b/scripts/65.mmie_train/baum_welch.pl
@@ -101,6 +101,10 @@
 } elsif ($ST::CFG_FORCEDALIGN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
+} elsif (defined($ST::CFG_MULTIPRON)
+	 && $ST::CFG_MULTIPRON eq "yes") {
+    $listoffiles = $ST::CFG_LISTOFFILES;
+    $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
 } elsif ($ST::CFG_VTLN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm
index a530d711..a1f3ff7a 100644
--- a/scripts/lib/SphinxTrain/Util.pm
+++ b/scripts/lib/SphinxTrain/Util.pm
@@ -634,6 +634,10 @@ sub GetLists {
     } elsif ($ST::CFG_FORCEDALIGN eq "yes") {
 	$listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
 	$transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
+    } elsif (defined($ST::CFG_MULTIPRON)
+	     && $ST::CFG_MULTIPRON eq "yes") {
+	$listoffiles = $ST::CFG_LISTOFFILES;
+	$transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
     } else {
 	$listoffiles = $ST::CFG_LISTOFFILES;
 	$transcriptfile = $ST::CFG_TRANSCRIPTFILE;
diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg
index 2f50e071..00dbb921 100644
--- a/templates/librispeech/etc/sphinx_train.cfg
+++ b/templates/librispeech/etc/sphinx_train.cfg
@@ -196,6 +196,14 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE
 # rejected for bad alignment.
 $CFG_FORCE_ALIGN_BEAM = 1e-60;
 
+# Multipron alignment (sphinx3_align): set to yes after you run
+#   perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl
+# so Baum-Welch and related steps use
+#   $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription
+# If $CFG_FORCEDALIGN is yes it takes precedence and this setting is ignored, so keep it no.
+# multipron_align uses $CFG_FORCE_ALIGN_BEAM (a smaller value, e.g. 1e-308, rejects fewer utts).
+$CFG_MULTIPRON = 'no';
+
 # Calculate an LDA/MLLT transform?
 $CFG_LDA_MLLT = 'no';
 # Dimensionality of LDA/MLLT output

From b0a23541c6f026667eb2e934576edebb5da8981f Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 13:52:36 -0400
Subject: [PATCH 04/12] Add vocab_dict filter and optional 00a.vocab_dict
 script

Python reduces the dictionary to words in the transcript vocabulary while keeping pronunciation variants. Perl script is invoked from the optional 00a.vocab_dict Makefile step; uses python like other SphinxTrain drivers. Define CFG_VOCAB_DICT and CFG_VOCAB_DICTIONARY in sphinx_train.cfg.
---
 etc/sphinx_train.cfg                       |  4 ++
 python/cmusphinx/vocab_dict.py             | 83 ++++++++++++++++++++++
 scripts/00a.vocab_dict/make_vocab_dict.pl  | 78 ++++++++++++++++++++
 templates/librispeech/etc/sphinx_train.cfg |  4 ++
 4 files changed, 169 insertions(+)
 create mode 100644 python/cmusphinx/vocab_dict.py
 create mode 100644 scripts/00a.vocab_dict/make_vocab_dict.pl

diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg
index 52f6b01c..a1d712c0 100644
--- a/etc/sphinx_train.cfg
+++ b/etc/sphinx_train.cfg
@@ -204,6 +204,10 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60;
 # for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts).
 $CFG_MULTIPRON = 'no';
 
+# Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
+$CFG_VOCAB_DICT = 'no';
+$CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic";
+
 # Calculate an LDA/MLLT transform?
 $CFG_LDA_MLLT = 'no';
 # Dimensionality of LDA/MLLT output
diff --git a/python/cmusphinx/vocab_dict.py b/python/cmusphinx/vocab_dict.py
new file mode 100644
index 00000000..18f10b31
--- /dev/null
+++ b/python/cmusphinx/vocab_dict.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2009 Carnegie Mellon University
+#
+# You may copy and modify this freely under the same terms as
+# Sphinx-III
+
+"""Filter dictionary to vocabulary from transcripts.
+
+Creates a reduced dictionary containing only words that appear in the
+training/test transcripts, preserving all pronunciation variants.
+"""
+
+import sys
+from cmusphinx import s3dict
+
+
+def load_vocab(vocab_path):
+    """Read a word list (one entry per line) and return it as a set.
+
+    Whitespace around each line is stripped, and lines that are empty
+    after stripping are skipped.
+    """
+    with open(vocab_path) as lines:
+        stripped = (raw.strip() for raw in lines)
+        return {entry for entry in stripped if entry}
+
+
+def filter_dict(indict, vocab, outfh):
+    """Write the entries of ``indict`` whose word is in ``vocab`` to ``outfh``.
+
+    Words come out sorted; the first pronunciation is written bare and later
+    ones as ``word(N)``. Returns ``(kept, unused)`` as sets of words.
+    """
+    dict_words = set(indict.words())
+    kept = vocab & dict_words
+    for word in sorted(kept):
+        for n, phones in enumerate(indict.alts(word), 1):
+            label = word if n == 1 else "%s(%d)" % (word, n)
+            outfh.write("%s %s\n" % (label, " ".join(phones)))
+
+    return kept, dict_words - vocab
+
+
+def main():
+    """Filter DICT by VOCAB into OUTDICT, print statistics, and list the unused words."""
+    if len(sys.argv) < 4:
+        print("Usage: %s DICT VOCAB OUTDICT" % sys.argv[0], file=sys.stderr)
+        sys.exit(1)
+
+    dict_path, vocab_path, out_path = sys.argv[1:4]
+
+    vocab = load_vocab(vocab_path)
+    indict = s3dict.open(dict_path)
+    in_words = set(indict.words())
+
+    with open(out_path, "w") as outfh:
+        kept, unused = filter_dict(indict, vocab, outfh)
+
+    # Stats: pronunciation totals are summed from indict.maxalt
+    full_prons = sum(indict.maxalt[w] for w in in_words)
+    kept_prons = sum(indict.maxalt[w] for w in kept)
+
+    print("Full dictionary:     %7d words" % len(in_words))
+    print("Transcript vocab:    %7d words" % len(vocab))
+    print("Kept in reduced:     %7d words" % len(kept))
+    print("Removed (unused):    %7d words" % len(unused))
+    print("Full pronunciations: %7d" % full_prons)
+    print("Kept pronunciations: %7d" % kept_prons)
+
+    # Unused words go next to OUTDICT; only a trailing ".dic" is swapped, so OUTDICT is never reused
+    unused_path = (out_path[:-4] if out_path.endswith(".dic") else out_path) + ".unused"
+    with open(unused_path, "w") as fh:
+        for w in sorted(unused):
+            fh.write("%s\n" % w)
+    print("Unused words:        %s" % unused_path)
+
+    # Warn about OOV
+    oov = vocab - in_words
+    if oov:
+        print("OOV words:           %7d (missing from dict)" % len(oov), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/00a.vocab_dict/make_vocab_dict.pl b/scripts/00a.vocab_dict/make_vocab_dict.pl
new file mode 100644
index 00000000..19f36e42
--- /dev/null
+++ b/scripts/00a.vocab_dict/make_vocab_dict.pl
@@ -0,0 +1,78 @@
+#!/usr/bin/perl
+## ====================================================================
+##
+## Copyright (c) 1996-2000 Carnegie Mellon University.  All rights
+## reserved.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions
+## are met:
+##
+## 1. Redistributions of source code must retain the above copyright
+##    notice, this list of conditions and the following disclaimer.
+##
+## 2. Redistributions in binary form must reproduce the above copyright
+##    notice, this list of conditions and the following disclaimer in
+##    the documentation and/or other materials provided with the
+##    distribution.
+##
+## This work was supported in part by funding from the Defense Advanced
+## Research Projects Agency and the National Science Foundation of the
+## United States of America, and the CMU Sphinx Speech Consortium.
+##
+## THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
+## ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+## THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+## PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+## NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+##
+## ====================================================================
+
+use strict;
+use File::Basename;
+use File::Spec::Functions;
+
+use lib catdir(dirname($0), updir(), 'lib');
+use SphinxTrain::Config;
+use SphinxTrain::Util;
+
+$| = 1; # Turn on autoflushing
+Log("MODULE: 00a Create vocabulary-restricted dictionary\n");
+
+if ($ST::CFG_VOCAB_DICT ne "yes") { # opt-in stage: any value other than "yes" skips it
+    Log("Skipped (set \$CFG_VOCAB_DICT = 'yes' to enable)\n");
+    exit(0); # a skipped stage exits with status 0
+}
+
+my $logdir = "$ST::CFG_LOG_DIR/00a.vocab_dict";
+mkdir($logdir, 0777) unless -d $logdir; # the result of mkdir is not checked
+my $logfile = "$logdir/$ST::CFG_EXPTNAME.vocab_dict.log"; # passed to RunTool below
+
+my $vocabfile = "$ST::CFG_LIST_DIR/$ST::CFG_EXPTNAME.vocab"; # word list; must already exist
+my $outdict = $ST::CFG_VOCAB_DICTIONARY; # path given to the filter as its output dictionary
+
+unless (-f $vocabfile) { # refuse to run the filter without a vocabulary file
+    LogError("Vocabulary file not found: $vocabfile\n");
+    LogError("Run 00.verify first to generate it\n");
+    exit(1);
+}
+
+Log("Creating vocabulary-restricted dictionary\n");
+Log("  Input dict:  $ST::CFG_DICTIONARY\n");
+Log("  Vocabulary:  $vocabfile\n");
+Log("  Output dict: $outdict\n");
+
+$ENV{PYTHONPATH} .= ':' . catdir($ST::CFG_SPHINXTRAIN_DIR, 'python'); # presumably so "-m cmusphinx.vocab_dict" resolves
+my $rv = RunTool("python", $logfile, 0, # NOTE(review): vocab_dict.py uses print(..., file=...); confirm "python" is Python 3
+		 "-m", "cmusphinx.vocab_dict",
+		 $ST::CFG_DICTIONARY,
+		 $vocabfile,
+		 $outdict);
+
+exit($rv); # the stage's exit status is whatever RunTool returned
diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg
index 00dbb921..94272c84 100644
--- a/templates/librispeech/etc/sphinx_train.cfg
+++ b/templates/librispeech/etc/sphinx_train.cfg
@@ -204,6 +204,10 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60;
 # for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts).
 $CFG_MULTIPRON = 'no';
 
+# Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
+$CFG_VOCAB_DICT = 'no';
+$CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic";
+
 # Calculate an LDA/MLLT transform?
 $CFG_LDA_MLLT = 'no';
 # Dimensionality of LDA/MLLT output

From b9a8e32d08eb74a3944a7a36bc868d208196ecd0 Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 13:52:38 -0400
Subject: [PATCH 05/12] Gitignore
 docs/plan-multi-pronunciation-iterative-training.md

Keeps optional local design notes out of the repository.
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 21789886..7288ba2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ autom4te.cache
 include/stamp-h2
 ylwrap
 build
+
+# Local design notes (not part of the distributed tree)
+docs/plan-multi-pronunciation-iterative-training.md

From dbaaa4446ea2c0afbfc04a9d9734174de17582fe Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 14:30:06 -0400
Subject: [PATCH 06/12] Multipron on by default: stage 21 after CI,
 CFG_MULTIPRON=no to disable

Insert training stage 21.multipron_align after CI to run multipron_align.pl
when CFG_MULTIPRON is set and not no. Use multipron transcript for CD and
later only when the file exists (Util::ShouldUseMultipronTranscript). Default
CFG_MULTIPRON to yes in sphinx_train.cfg; omit or unset variable keeps legacy
behavior without stage 21.

Stage 21 driver: import dirname for lib path; do not pass an extra etc path
to multipron_align.pl. sphinxtrain decodes POSIX wait status from os.system()
so failed Perl stages fail the shell on Unix.
---
 etc/sphinx_train.cfg                          | 14 +++----
 scripts/11.force_align/multipron_align.pl     |  2 +-
 scripts/11.force_align/multipron_align.py     | 10 ++---
 scripts/21.multipron_align/slave_align.pl     | 41 +++++++++++++++++++
 scripts/30.cd_hmm_untied/slave_convg.pl       |  5 +--
 scripts/60.lattice_generation/slave_genlat.pl |  5 +--
 scripts/65.mmie_train/baum_welch.pl           |  5 +--
 scripts/lib/SphinxTrain/Util.pm               | 20 +++++++--
 scripts/sphinxtrain                           | 25 ++++++++---
 templates/librispeech/etc/sphinx_train.cfg    | 14 +++----
 10 files changed, 102 insertions(+), 39 deletions(-)
 create mode 100755 scripts/21.multipron_align/slave_align.pl

diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg
index a1d712c0..a8bd6cb5 100644
--- a/etc/sphinx_train.cfg
+++ b/etc/sphinx_train.cfg
@@ -196,13 +196,13 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE
 # rejected for bad alignment.
 $CFG_FORCE_ALIGN_BEAM = 1e-60;
 
-# Multipron alignment (sphinx3_align): set to yes after you run
-#   perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl
-# so Baum-Welch and related steps use
-#   $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription
-# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width
-# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts).
-$CFG_MULTIPRON = 'no';
+# Multipron: after CI, stage 21 runs multipron alignment (sphinx3_align); CD and later
+# steps use $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription when present.
+# If this variable is missing (old configs), multipron is off. Set to no to skip stage 21
+# and use only $CFG_TRANSCRIPTFILE everywhere. Requires
+# sphinx3_align in $CFG_BIN_DIR. Keep $CFG_FORCEDALIGN = no unless you also run stock
+# 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308).
+$CFG_MULTIPRON = 'yes';
 
 # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
 $CFG_VOCAB_DICT = 'no';
diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl
index 62e4fe5b..cb513bbe 100644
--- a/scripts/11.force_align/multipron_align.pl
+++ b/scripts/11.force_align/multipron_align.pl
@@ -4,7 +4,7 @@
 ## Usage (from training project directory, same as sphinxtrain run):
 ##   perl path/to/scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH]
 ##
-## See $CFG_MULTIPRON in etc/sphinx_train.cfg (set to yes after alignment for training).
+## See $CFG_MULTIPRON in etc/sphinx_train.cfg (no disables multipron; stage 21 runs this after CI by default).
 
 use strict;
 use warnings;
diff --git a/scripts/11.force_align/multipron_align.py b/scripts/11.force_align/multipron_align.py
index 9aa4efdb..57a10b9c 100644
--- a/scripts/11.force_align/multipron_align.py
+++ b/scripts/11.force_align/multipron_align.py
@@ -8,11 +8,11 @@
 full SphinxTrain ``sphinxtrain run`` pipeline.
 
 Training layout comes from ``etc/sphinx_train.cfg`` (``CFG_BASE_DIR``, ``CFG_EXPTNAME``,
-and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``. Set
-``CFG_MULTIPRON`` to ``yes`` in that file after alignment so Baum-Welch uses the generated
-transcript. Optional: ``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under
-``CFG_BIN_DIR``. Beam width follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to
-``1e-308`` if unset.
+and the usual align inputs). Outputs go under ``$CFG_BASE_DIR/multipron_align/``.
+Normally stage 21 runs this after CI when ``CFG_MULTIPRON`` is not ``no``. Set
+``CFG_MULTIPRON`` to ``no`` to disable multipron entirely. Optional:
+``CFG_SPHINX3_ALIGN_BINARY`` if ``sphinx3_align`` is not under ``CFG_BIN_DIR``. Beam width
+follows ``CFG_FORCE_ALIGN_BEAM``, or ``--beam``, defaulting to ``1e-308`` if unset.
 
 Usage (from project base, after ``sphinxtrain -t TASK setup``)::
 
diff --git a/scripts/21.multipron_align/slave_align.pl b/scripts/21.multipron_align/slave_align.pl
new file mode 100755
index 00000000..956cefdc
--- /dev/null
+++ b/scripts/21.multipron_align/slave_align.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/env perl
+## After CI HMM training, run multipron force alignment so later stages can use
+## pronunciation-disambiguated transcripts. Skipped when $CFG_MULTIPRON is no.
+
+use strict;
+use warnings;
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(catfile catdir updir);
+
+use lib catdir(dirname($0), updir(), 'lib');
+use SphinxTrain::Config;
+use SphinxTrain::Util;
+
+$| = 1; # autoflush output
+Log("MODULE: 21 Multipron alignment (after CI)\n");
+
+if (!defined($ST::CFG_MULTIPRON) || $ST::CFG_MULTIPRON eq "no") { # an unset variable counts as off
+    Log("Skipped (\$CFG_MULTIPRON unset or no)\n");
+    exit 0; # a skipped stage exits with status 0
+}
+
+my $pl = catfile($ST::CFG_SPHINXTRAIN_DIR, qw(scripts 11.force_align multipron_align.pl)); # script this stage runs
+my $etc = catfile($ST::CFG_BASE_DIR, "etc"); # only checked for existence below; not passed on
+unless (-f $pl) {
+    LogError("Missing $pl\n");
+    exit 1;
+}
+unless (-d $etc) {
+    LogError("Missing directory $etc\n");
+    exit 1;
+}
+
+my $logdir = "$ST::CFG_LOG_DIR/21.multipron_align";
+mkdir($logdir, 0777) unless -d $logdir; # the result of mkdir is not checked
+my $logfile = "$logdir/$ST::CFG_EXPTNAME.multipron_align_wrap.log"; # log file passed to RunTool
+
+Log("Running multipron_align.pl (requires CI models under \$CFG_MODEL_DIR)\n");
+# Do not pass $etc on the command line: multipron_align.pl resolves etc/ from
+# sphinx_train.cfg; an extra path is forwarded to multipron_align.py as a bogus arg.
+my $rv = RunTool($^X, $logfile, 0, $pl);
+exit($rv);
diff --git a/scripts/30.cd_hmm_untied/slave_convg.pl b/scripts/30.cd_hmm_untied/slave_convg.pl
index 03f77abe..0898c565 100644
--- a/scripts/30.cd_hmm_untied/slave_convg.pl
+++ b/scripts/30.cd_hmm_untied/slave_convg.pl
@@ -144,10 +144,9 @@ ()
   if ($ST::CFG_FORCEDALIGN eq "yes") {
       $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
       $transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
-  } elsif (defined($ST::CFG_MULTIPRON)
-	   && $ST::CFG_MULTIPRON eq "yes") {
+  } elsif (ShouldUseMultipronTranscript()) {
       $listoffiles = $ST::CFG_LISTOFFILES;
-      $transcriptfile  = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
+      $transcriptfile  = MultipronTranscriptFile();
   } elsif ($ST::CFG_VTLN eq "yes") {
       $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
       $transcriptfile  = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/60.lattice_generation/slave_genlat.pl b/scripts/60.lattice_generation/slave_genlat.pl
index f803796b..758e0bfd 100644
--- a/scripts/60.lattice_generation/slave_genlat.pl
+++ b/scripts/60.lattice_generation/slave_genlat.pl
@@ -71,10 +71,9 @@
 if ($ST::CFG_FORCEDALIGN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
-} elsif (defined($ST::CFG_MULTIPRON)
-	 && $ST::CFG_MULTIPRON eq "yes") {
+} elsif (ShouldUseMultipronTranscript()) {
     $listoffiles = $ST::CFG_LISTOFFILES;
-    $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
+    $transcriptfile = MultipronTranscriptFile();
 } elsif ($ST::CFG_VTLN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile  = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/65.mmie_train/baum_welch.pl b/scripts/65.mmie_train/baum_welch.pl
index f0f5da56..b98bce6d 100644
--- a/scripts/65.mmie_train/baum_welch.pl
+++ b/scripts/65.mmie_train/baum_welch.pl
@@ -101,10 +101,9 @@
 } elsif ($ST::CFG_FORCEDALIGN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
-} elsif (defined($ST::CFG_MULTIPRON)
-	 && $ST::CFG_MULTIPRON eq "yes") {
+} elsif (ShouldUseMultipronTranscript()) {
     $listoffiles = $ST::CFG_LISTOFFILES;
-    $transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
+    $transcriptfile = MultipronTranscriptFile();
 } elsif ($ST::CFG_VTLN eq "yes") {
     $listoffiles   = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedfiles";
     $transcriptfile = "$ST::CFG_BASE_DIR/vtlnout/${ST::CFG_EXPTNAME}.alignedtranscripts";
diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm
index a1f3ff7a..ce08e4f1 100644
--- a/scripts/lib/SphinxTrain/Util.pm
+++ b/scripts/lib/SphinxTrain/Util.pm
@@ -45,7 +45,8 @@ use vars qw(@ISA @EXPORT);
            ImgSrc LogWarning LogError LogProgress
            LogStatus Converged RunTool SubstParams
            RunScript LaunchScript WaitForScript GetLists GetDict
-	   WaitForConvergence TiedWaitForConvergence WaitForMMIEConverge Trim);
+	   WaitForConvergence TiedWaitForConvergence WaitForMMIEConverge Trim
+	   MultipronTranscriptFile ShouldUseMultipronTranscript);
 
 use Sys::Hostname;
 use File::Basename;
@@ -621,6 +622,18 @@ sub GetDict {
     }
 }
 
+sub MultipronTranscriptFile { # path of the multipron transcript under $CFG_BASE_DIR/multipron_align
+    return "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
+}
+
+# Multipron is off when CFG_MULTIPRON is unset (old configs) or "no". Otherwise use the
+# multipron transcript only once stage 21 has produced the file. Always returns 1 or 0.
+sub ShouldUseMultipronTranscript {
+    return 0 unless defined($ST::CFG_MULTIPRON);
+    return 0 if $ST::CFG_MULTIPRON eq "no";
+    return (-f MultipronTranscriptFile()) ? 1 : 0;
+}
+
 sub GetLists {
     # aligned transcripts and the list of aligned files is obtained as a result
     # of (03.) forced alignment or (04.) VTLN
@@ -634,10 +647,9 @@ sub GetLists {
     } elsif ($ST::CFG_FORCEDALIGN eq "yes") {
 	$listoffiles   = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedfiles";
 	$transcriptfile  = "$ST::CFG_BASE_DIR/falignout/${ST::CFG_EXPTNAME}.alignedtranscripts";
-    } elsif (defined($ST::CFG_MULTIPRON)
-	     && $ST::CFG_MULTIPRON eq "yes") {
+    } elsif (ShouldUseMultipronTranscript()) {
 	$listoffiles = $ST::CFG_LISTOFFILES;
-	$transcriptfile = "$ST::CFG_BASE_DIR/multipron_align/${ST::CFG_EXPTNAME}.multipron.transcription";
+	$transcriptfile = MultipronTranscriptFile();
     } else {
 	$listoffiles = $ST::CFG_LISTOFFILES;
 	$transcriptfile = $ST::CFG_TRANSCRIPTFILE;
diff --git a/scripts/sphinxtrain b/scripts/sphinxtrain
index eaac4f4a..27c6d560 100755
--- a/scripts/sphinxtrain
+++ b/scripts/sphinxtrain
@@ -11,6 +11,15 @@ sphinxbinpath = ""
 sphinxpath = ""
 
 
+def _perl_script_exit_code(status):
+    """Map an os.system() status to an exit code; a signal death yields 128+signo, not 0."""
+    if os.name == "nt":
+        return status
+    if status & 0x7F:  # POSIX: low 7 bits hold the terminating signal and the exit byte is 0
+        return 128 + (status & 0x7F)
+    return (status >> 8) & 0xFF
+
+
 def find_paths():
     global training_basedir
     global sphinxbinpath
@@ -83,6 +92,7 @@ steps = [
     "11.force_align/slave_align.pl",
     "12.vtln_align/slave_align.pl",
     "20.ci_hmm/slave_convg.pl",
+    "21.multipron_align/slave_align.pl",
     "30.cd_hmm_untied/slave_convg.pl",
     "40.buildtrees/slave.treebuilder.pl",
     "45.prunetree/slave.state-tying.pl",
@@ -103,8 +113,9 @@ def run_stages(stages):
             number = step.split("/")[0].split(".")[0]
             if name == stage or number == stage:
                 ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'")
-                if ret != 0:
-                    exit(ret)
+                ec = _perl_script_exit_code(ret)
+                if ec != 0:
+                    sys.exit(ec)
 
 
 def run_from(stage):
@@ -115,16 +126,18 @@ def run_from(stage):
         if name == stage or number == stage or found:
             found = True
             ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'")
-            if ret != 0:
-                exit(ret)
+            ec = _perl_script_exit_code(ret)
+            if ec != 0:
+                sys.exit(ec)
 
 
 def run():
     print("Running the training")
     for step in steps:
         ret = os.system("perl '" + sphinxpath + "/scripts/" + step + "'")
-        if ret != 0:
-            exit(ret)
+        ec = _perl_script_exit_code(ret)
+        if ec != 0:
+            sys.exit(ec)
 
 
 def usage():
diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg
index 94272c84..e4f18332 100644
--- a/templates/librispeech/etc/sphinx_train.cfg
+++ b/templates/librispeech/etc/sphinx_train.cfg
@@ -196,13 +196,13 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE
 # rejected for bad alignment.
 $CFG_FORCE_ALIGN_BEAM = 1e-60;
 
-# Multipron alignment (sphinx3_align): set to yes after you run
-#   perl $CFG_SPHINXTRAIN_DIR/scripts/11.force_align/multipron_align.pl
-# so Baum-Welch and related steps use
-#   $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription
-# Keep $CFG_FORCEDALIGN = no unless you also run stock 11.force_align. Alignment search width
-# for multipron_align is $CFG_FORCE_ALIGN_BEAM (wider, e.g. 1e-308, rejects fewer utts).
-$CFG_MULTIPRON = 'no';
+# Multipron: after CI, stage 21 runs multipron alignment (sphinx3_align); CD and later
+# steps use $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription when present.
+# If this variable is missing (old configs), multipron is off. Set to no to skip stage 21
+# and use only $CFG_TRANSCRIPTFILE everywhere. Requires
+# sphinx3_align in $CFG_BIN_DIR. Keep $CFG_FORCEDALIGN = no unless you also run stock
+# 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308).
+$CFG_MULTIPRON = 'yes';
 
 # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
 $CFG_VOCAB_DICT = 'no';

From a03926a163ac0aff1ca83d9894a7f2afc6641e55 Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 14:47:11 -0400
Subject: [PATCH 07/12] docs: document multipron stage 21

Describe default multipron alignment after CI, sphinx3_align, and how to
disable via CFG_MULTIPRON. Fix Acknowledgments heading spelling.
---
 README.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5a9d23f3..f65e24ed 100644
--- a/README.md
+++ b/README.md
@@ -106,6 +106,17 @@ You do not need to install SphinxTrain to run it, simply run
 training directory.  Note that you do need to build and install
 PocketSphinx for evaluation to work properly, however.
 
+Multipron alignment (optional stage 21)
+----------------------------------------
+
+After CI HMM training, the default configuration runs multipron force
+alignment so pronunciation-disambiguated transcripts can be written
+under `multipron_align/` in your project. This uses the `sphinx3_align`
+program built with the rest of the tree (`cmake --build build`).
+
+Set `$CFG_MULTIPRON` to `no` in `etc/sphinx_train.cfg` if you want to
+skip stage 21 and use only the original transcripts for later stages.
+
 You can also install SphinxTrain system-wide if you so desire:
 
     sudo cmake --build build --target install
@@ -138,7 +149,7 @@ procedure is identical to the Unix installation.
 
 Also, check the section title "All Platforms" above.
 
-Acknowldegments
+Acknowledgments
 ---------------
 
 The development of this code has included support at different times

From 4feb5ae95a954b8d227f5ca0177f2007f9e2930c Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 14:52:30 -0400
Subject: [PATCH 08/12] Include new scripts in CMakeLists

---
 scripts/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index c41b087e..b34c78aa 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -2,6 +2,7 @@ set(SCRIPTDIRS
 0000.g2p_train
 000.comp_feat
 00.verify
+00a.vocab_dict
 01.lda_train
 02.mllt_train
 05.vector_quantize
@@ -9,6 +10,7 @@ set(SCRIPTDIRS
 11.force_align
 12.vtln_align
 20.ci_hmm
+21.multipron_align
 30.cd_hmm_untied
 40.buildtrees
 45.prunetree

From 6f8e7a9a604d4f67b95022bb6737fe95415ffb9d Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 15:01:29 -0400
Subject: [PATCH 09/12] Fix CI: pick python3 for multipron_align, log failure hints

---
 scripts/11.force_align/multipron_align.pl |  7 ++++++-
 scripts/21.multipron_align/slave_align.pl | 12 ++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/scripts/11.force_align/multipron_align.pl b/scripts/11.force_align/multipron_align.pl
index cb513bbe..e83c657e 100644
--- a/scripts/11.force_align/multipron_align.pl
+++ b/scripts/11.force_align/multipron_align.pl
@@ -27,7 +27,12 @@
     die "Missing $py\n";
 }
 
-my @py = ('python', $py);
+# Prefer $PYTHON; else python3 on Unix (many images have no `python` symlink); else python.
+my $pyexe = $ENV{PYTHON};
+if (!defined($pyexe) || $pyexe eq '') {
+    $pyexe = ($^O eq 'MSWin32') ? 'python' : 'python3';
+}
+my @py = ($pyexe, $py);
 push @py, '--dry-run' if $dry;
 push @py, $etc, @args;
 exec @py;
diff --git a/scripts/21.multipron_align/slave_align.pl b/scripts/21.multipron_align/slave_align.pl
index 956cefdc..605fd179 100755
--- a/scripts/21.multipron_align/slave_align.pl
+++ b/scripts/21.multipron_align/slave_align.pl
@@ -38,4 +38,16 @@
 # Do not pass $etc on the command line: multipron_align.pl resolves etc/ from
 # sphinx_train.cfg; an extra path is forwarded to multipron_align.py as a bogus arg.
 my $rv = RunTool($^X, $logfile, 0, $pl);
+if ($rv != 0) {
+    my $sphinx_log = catfile(
+        $ST::CFG_BASE_DIR, "multipron_align",
+        "$ST::CFG_EXPTNAME.multipron_align.log"
+    );
+    LogError(
+        "multipron_align exit $rv. Stderr from Python is in $logfile; "
+        . "if sphinx3_align started, full command/output is in $sphinx_log. "
+        . "Typical causes: sphinx3_align missing under \$CFG_BIN_DIR, "
+        . "missing CI HMM dir under \$CFG_MODEL_DIR, or sphinx3_align nonzero.\n"
+    );
+}
 exit($rv);

From c5977e3e0a0e09ef340f50dbe391a97cdc98a45b Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 15:14:29 -0400
Subject: [PATCH 10/12] Allow two-pass and fix some CI

---
 README.md                                  | 13 +++++++++++++
 etc/sphinx_train.cfg                       | 13 +++++++++++++
 scripts/CMakeLists.txt                     |  1 +
 scripts/sphinxtrain                        |  1 +
 templates/librispeech/etc/sphinx_train.cfg | 13 +++++++++++++
 5 files changed, 41 insertions(+)

diff --git a/README.md b/README.md
index f65e24ed..222f286a 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,19 @@ program built with the rest of the tree (`cmake --build build`).
 
 Set `$CFG_MULTIPRON` to `no` in `etc/sphinx_train.cfg` if you want to
 skip stage 21 and use only the original transcripts for later stages.
+For **semi** (`.semi.`) and **PTM** (`.ptm.`) models, the template turns
+multipron and stage 22 off automatically; they are intended for the
+**continuous** (`.cont.`) path.
+
+**Optional second CI pass (stage 22)**
+
+After multipron (stage 21), you can set `$CFG_CI_REESTIMATE_AFTER_MULTIPRON`
+to `yes` to run **stage 22**, which repeats the same CI training driver as
+stage 20. Once the multipron transcript exists, `GetLists()` uses it for
+Baum–Welch, so this pass trains CI models on pronunciation-disambiguated
+text. It performs a **full** CI cycle again (including flat initialization)
+and **replaces** the CI model directory, roughly doubling CI time. Default
+is `no`.
 
 You can also install SphinxTrain system-wide if you so desire:
 
diff --git a/etc/sphinx_train.cfg b/etc/sphinx_train.cfg
index a8bd6cb5..eae043a6 100644
--- a/etc/sphinx_train.cfg
+++ b/etc/sphinx_train.cfg
@@ -204,6 +204,19 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60;
 # 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308).
 $CFG_MULTIPRON = 'yes';
 
+# Second CI pass after multipron (stage 22): re-run stage-20 training with supervision
+# from the multipron transcript (GetLists selects it once the file from stage 21 exists).
+# Re-runs flat init and replaces the CI model directory—roughly doubles CI wall time.
+# Default no; requires CFG_MULTIPRON and a successful stage 21.
+$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';
+
+# Stages 21–22 are exercised with continuous ($CFG_HMM_TYPE eq '.cont.') models; semi and
+# PTM use a different training/alignment path, so keep them off unless you are on .cont..
+if ($CFG_HMM_TYPE ne '.cont.') {
+  $CFG_MULTIPRON = 'no';
+  $CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';
+}
+
 # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
 $CFG_VOCAB_DICT = 'no';
 $CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic";
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index b34c78aa..7039e6c2 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -11,6 +11,7 @@ set(SCRIPTDIRS
 12.vtln_align
 20.ci_hmm
 21.multipron_align
+22.ci_hmm_multipron
 30.cd_hmm_untied
 40.buildtrees
 45.prunetree
diff --git a/scripts/sphinxtrain b/scripts/sphinxtrain
index 27c6d560..10c4fc67 100755
--- a/scripts/sphinxtrain
+++ b/scripts/sphinxtrain
@@ -93,6 +93,7 @@ steps = [
     "12.vtln_align/slave_align.pl",
     "20.ci_hmm/slave_convg.pl",
     "21.multipron_align/slave_align.pl",
+    "22.ci_hmm_multipron/slave_convg.pl",
     "30.cd_hmm_untied/slave_convg.pl",
     "40.buildtrees/slave.treebuilder.pl",
     "45.prunetree/slave.state-tying.pl",
diff --git a/templates/librispeech/etc/sphinx_train.cfg b/templates/librispeech/etc/sphinx_train.cfg
index e4f18332..c98da2ab 100644
--- a/templates/librispeech/etc/sphinx_train.cfg
+++ b/templates/librispeech/etc/sphinx_train.cfg
@@ -204,6 +204,19 @@ $CFG_FORCE_ALIGN_BEAM = 1e-60;
 # 11.force_align. Beam for multipron_align follows $CFG_FORCE_ALIGN_BEAM (else 1e-308).
 $CFG_MULTIPRON = 'yes';
 
+# Second CI pass after multipron (stage 22): re-run stage-20 training with supervision
+# from the multipron transcript (GetLists selects it once the file from stage 21 exists).
+# Re-runs flat init and replaces the CI model directory—roughly doubles CI wall time.
+# Default no; requires CFG_MULTIPRON and a successful stage 21.
+$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';
+
+# Stages 21–22 are exercised with continuous ($CFG_HMM_TYPE eq '.cont.') models; semi and
+# PTM use a different training/alignment path, so keep them off unless you are on .cont..
+if ($CFG_HMM_TYPE ne '.cont.') {
+  $CFG_MULTIPRON = 'no';
+  $CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';
+}
+
 # Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
 $CFG_VOCAB_DICT = 'no';
 $CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic";

From 29d5096438168ee4076283d362fae87d38289252 Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 15:20:45 -0400
Subject: [PATCH 11/12] Add scripts/22.ci_hmm_multipron for install target

CMake installs SCRIPTDIRS including 22.ci_hmm_multipron; the directory was
listed but not in the tree on CI, so cmake --install failed.
---
 scripts/22.ci_hmm_multipron/slave_convg.pl | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100755 scripts/22.ci_hmm_multipron/slave_convg.pl

diff --git a/scripts/22.ci_hmm_multipron/slave_convg.pl b/scripts/22.ci_hmm_multipron/slave_convg.pl
new file mode 100755
index 00000000..84ae97aa
--- /dev/null
+++ b/scripts/22.ci_hmm_multipron/slave_convg.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+## Optional second CI training pass after multipron (stage 21). When enabled, runs the
+## same driver as stage 20; Baum-Welch uses GetLists(), which selects the multipron
+## transcript once $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription
+## exists. This re-runs flat initialization and replaces the CI model directory (same as
+## a fresh stage 20). Set $CFG_CI_REESTIMATE_AFTER_MULTIPRON to 'no' (default) to skip.
+
+use strict;
+use warnings;
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(catdir catfile updir);
+
+use lib catdir(dirname($0), updir(), 'lib');    # <script dir>/../lib (SphinxTrain modules)
+use SphinxTrain::Config;
+use SphinxTrain::Util;
+
+$| = 1;    # autoflush the currently selected output handle
+Log("MODULE: 22 Optional second CI pass (multipron supervision)\n");
+
+if (!defined $ST::CFG_CI_REESTIMATE_AFTER_MULTIPRON
+    || $ST::CFG_CI_REESTIMATE_AFTER_MULTIPRON ne 'yes') {
+    Log("Skipped (set \$CFG_CI_REESTIMATE_AFTER_MULTIPRON to 'yes' to enable)\n");
+    exit 0;    # an unset or non-'yes' option means the stage is skipped, not failed
+}
+if (!ShouldUseMultipronTranscript()) {
+    LogError(
+        "CFG_CI_REESTIMATE_AFTER_MULTIPRON is yes but multipron transcript is not available. "
+            . "Keep \$CFG_MULTIPRON enabled, complete stage 21, or set "
+            . "CFG_CI_REESTIMATE_AFTER_MULTIPRON to no.\n"
+    );
+    exit 1;
+}
+
+my $target = catfile(dirname($0), updir(), '20.ci_hmm', 'slave_convg.pl');
+Log("Invoking $target (same as stage 20; supervision = multipron transcript via GetLists)\n");
+exec $^X, $target, @ARGV or die "exec $target: $!\n";    # exec returns only on failure

From 321fd173feb7dd8c42fd0952dbc9ad4dba96acca Mon Sep 17 00:00:00 2001
From: Kevin Lenzo <lenzo@duolingo.com>
Date: Wed, 8 Apr 2026 15:22:24 -0400
Subject: [PATCH 12/12] Util: count only line-initial ERROR:/WARN: log lines in RunTool

---
 scripts/lib/SphinxTrain/Util.pm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/lib/SphinxTrain/Util.pm b/scripts/lib/SphinxTrain/Util.pm
index ce08e4f1..e4e90a88 100644
--- a/scripts/lib/SphinxTrain/Util.pm
+++ b/scripts/lib/SphinxTrain/Util.pm
@@ -355,8 +355,10 @@ sub RunTool {
         $returnvalue = 1;
         last;
       }
-      $error_count++ if m/(ERROR).*/;
-      $warning_count++ if m/(WARNING).*/;
+      # Match Sphinx-style log lines only. A bare /ERROR/ also matches words like "TERROR"
+      # in transcripts; real messages start "ERROR:" / "WARN[ING]:" (see sphinxbase err_msg).
+      $error_count++ if m/^\s*ERROR:/;
+      $warning_count++ if m/^\s*WARN(?:ING)?:/;
       if ($ctl_counter) {
 	# Keep track of progress being made.
 	$processed_counter++  if (/.*(utt\>).*/);