Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ autom4te.cache
include/stamp-h2
ylwrap
build

# Local design notes (not part of the distributed tree)
docs/plan-multi-pronunciation-iterative-training.md
88 changes: 88 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
cmake_minimum_required(VERSION 3.14) # I like pie

# After an Xcode upgrade, CMakeCache may keep CMAKE_OSX_SYSROOT pointing at a removed
# SDK; the link step then fails (e.g. missing .../usr/lib/libm.tbd). Reset to the
# active SDK when the cached path is missing (must run before project()).
if(APPLE)
  # Only a cached *absolute path* that has disappeared is treated as stale.
  # An empty value (CMake chooses its own default, initialised from the SDKROOT
  # environment variable when set) and an SDK *name* such as "macosx" or
  # "iphoneos" are both valid settings and must not be overwritten.
  if(CMAKE_OSX_SYSROOT
     AND IS_ABSOLUTE "${CMAKE_OSX_SYSROOT}"
     AND NOT EXISTS "${CMAKE_OSX_SYSROOT}")
    # Ask the active Xcode / Command Line Tools for the current macOS SDK.
    # ERROR_QUIET plus the result check make this a silent no-op when xcrun
    # is unavailable or fails.
    execute_process(
      COMMAND xcrun --sdk macosx --show-sdk-path
      OUTPUT_VARIABLE _sphinxtrain_osx_sdk
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_QUIET
      RESULT_VARIABLE _sphinxtrain_xcrun_rv
    )
    if(_sphinxtrain_xcrun_rv EQUAL 0 AND _sphinxtrain_osx_sdk AND EXISTS "${_sphinxtrain_osx_sdk}")
      # FORCE is deliberate: the cached value points at a directory that no
      # longer exists, so there is no usable user choice left to preserve.
      set(CMAKE_OSX_SYSROOT "${_sphinxtrain_osx_sdk}" CACHE PATH "macOS SDK path" FORCE)
      message(STATUS "CMAKE_OSX_SYSROOT -> ${CMAKE_OSX_SYSROOT}")
    endif()
  endif()
endif()

project(SphinxTrain VERSION 5.0.0
DESCRIPTION "CMU Sphinx Trainer"
HOMEPAGE_URL "https://github.com/cmusphinx/sphinxtrain")
Expand Down Expand Up @@ -66,6 +89,71 @@ set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
# Generate config.h from the config.h.in template (input is resolved against the
# current source directory, output against the current binary directory).
configure_file(config.h.in config.h)
# Directory-scoped definition: HAVE_CONFIG_H is passed to everything compiled in
# this directory and below -- presumably so sources include the generated header.
add_definitions(-DHAVE_CONFIG_H)

# FindBLAS / FindLAPACK cache absolute paths under the SDK; after an Xcode upgrade those
# files may move (e.g. .../MacOSX26.1.sdk/...) while CMAKE_OSX_SYSROOT was fixed above.
# Also drop paths that still exist but live under a different SDK tree than CMAKE_OSX_SYSROOT.
if(APPLE)
  # sphinxtrain_sdk_path_is_stale(<path> <out_var>)
  #
  # Sets <out_var> in the caller's scope to TRUE when <path> is an absolute
  # path that either
  #   * no longer exists, or
  #   * lies inside some "*.sdk" tree that is not the one CMAKE_OSX_SYSROOT
  #     points at;
  # and to FALSE otherwise.  Non-absolute values (library names, linker flags)
  # are never reported as stale.
  function(sphinxtrain_sdk_path_is_stale path out_var)
    set(_stale FALSE)
    if("${path}" MATCHES "^/")
      if(NOT EXISTS "${path}")
        set(_stale TRUE)
      elseif(IS_ABSOLUTE "${CMAKE_OSX_SYSROOT}" AND "${path}" MATCHES "/[^/]+\\.sdk/")
        # CMAKE_OSX_SYSROOT may also hold an SDK *name* ("macosx"); a prefix
        # test against a name would flag every SDK path on every configure,
        # hence the IS_ABSOLUTE guard above.
        # Compare against "<sysroot>/" so that a sibling directory such as
        # ".../MacOSX.sdk.bak/" is not mistaken for ".../MacOSX.sdk".
        string(REGEX REPLACE "/+$" "" _sysroot "${CMAKE_OSX_SYSROOT}")
        string(FIND "${path}" "${_sysroot}/" _pos)
        if(NOT _pos EQUAL 0)
          set(_stale TRUE)
        endif()
      endif()
    endif()
    set(${out_var} ${_stale} PARENT_SCOPE)
  endfunction()

  # Single-path cache entries written by FindBLAS / FindLAPACK / find_library.
  set(_sphinxtrain_apple_libcache_vars
    BLAS_Accelerate_LIBRARY
    BLAS_blas_LIBRARY
    BLAS_LIBRARY
    LAPACK_Accelerate_LIBRARY
    LAPACK_lapack_LIBRARY
    LAPACK_LIBRARY
    MATH_LIBRARY
  )
  foreach(_sphinxtrain_v IN LISTS _sphinxtrain_apple_libcache_vars)
    # Skip unset / *-NOTFOUND entries; they will be searched again anyway.
    if(${_sphinxtrain_v})
      sphinxtrain_sdk_path_is_stale("${${_sphinxtrain_v}}" _sphinxtrain_drop)
      if(_sphinxtrain_drop)
        unset(${_sphinxtrain_v} CACHE)
      endif()
    endif()
  endforeach()

  # Aggregate library lists: one stale element invalidates the whole list and
  # the matching *_FOUND flag so the find module runs from scratch.
  foreach(_sphinxtrain_listvar BLAS_LIBRARIES LAPACK_LIBRARIES)
    if(${_sphinxtrain_listvar})
      set(_sphinxtrain_drop_libs FALSE)
      foreach(_sphinxtrain_item IN LISTS ${_sphinxtrain_listvar})
        sphinxtrain_sdk_path_is_stale("${_sphinxtrain_item}" _sphinxtrain_item_stale)
        if(_sphinxtrain_item_stale)
          set(_sphinxtrain_drop_libs TRUE)
          break()
        endif()
      endforeach()
      if(_sphinxtrain_drop_libs)
        unset(${_sphinxtrain_listvar} CACHE)
        if(_sphinxtrain_listvar STREQUAL "BLAS_LIBRARIES")
          unset(BLAS_FOUND CACHE)
        else()
          unset(LAPACK_FOUND CACHE)
        endif()
      endif()
    endif()
  endforeach()
endif()

# Compile all the things
add_subdirectory(src)

Expand Down
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,30 @@ You do not need to install SphinxTrain to run it, simply run
training directory. Note that you do need to build and install
PocketSphinx for evaluation to work properly, however.

Multipron alignment (optional stage 21)
----------------------------------------

After CI HMM training, the default configuration runs multipron force
alignment so pronunciation-disambiguated transcripts can be written
under `multipron_align/` in your project. This uses the `sphinx3_align`
program built with the rest of the tree (`cmake --build build`).

Set `$CFG_MULTIPRON` to `no` in `etc/sphinx_train.cfg` if you want to
skip stage 21 and use only the original transcripts for later stages.
For **semi** (`.semi.`) and **PTM** (`.ptm.`) models, the template turns
multipron and stage 22 off automatically; stages 21 and 22 are intended
only for the **continuous** (`.cont.`) path.

Optional second CI pass (stage 22)
----------------------------------

After multipron (stage 21), you can set `$CFG_CI_REESTIMATE_AFTER_MULTIPRON`
to `yes` to run **stage 22**, which repeats the same CI training driver as
stage 20. Once the multipron transcript exists, `GetLists()` uses it for
Baum–Welch, so this pass trains CI models on pronunciation-disambiguated
text. It performs a **full** CI cycle again (including flat initialization)
and **replaces** the CI model directory, roughly doubling CI time. Default
is `no`.

You can also install SphinxTrain system-wide if you so desire:

sudo cmake --build build --target install
Expand Down Expand Up @@ -138,7 +162,7 @@ procedure is identical to the Unix installation.

Also, check the section title "All Platforms" above.

Acknowldegments
Acknowledgments
---------------

The development of this code has included support at different times
Expand Down
25 changes: 25 additions & 0 deletions etc/sphinx_train.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,31 @@ $CFG_FORCE_ALIGN_MODELDIR = "$CFG_MODEL_DIR/$CFG_EXPTNAME.falign_ci_$CFG_DIRLABE
# rejected for bad alignment.
$CFG_FORCE_ALIGN_BEAM = 1e-60;

# Multipron: after CI training, stage 21 runs multipron alignment (sphinx3_align). CD and
# later steps use $CFG_BASE_DIR/multipron_align/$CFG_EXPTNAME.multipron.transcription when
# that file is present.
# If this variable is missing (old configs), multipron is off. Set it to 'no' to skip
# stage 21 and use only $CFG_TRANSCRIPTFILE everywhere.
# Requires sphinx3_align in $CFG_BIN_DIR. Keep $CFG_FORCEDALIGN = no unless you also run
# the stock 11.force_align stage. The beam for multipron_align follows
# $CFG_FORCE_ALIGN_BEAM (else 1e-308).
$CFG_MULTIPRON = 'yes';

# Second CI pass after multipron (stage 22): re-run stage-20 training with supervision
# from the multipron transcript (GetLists selects it once the file from stage 21 exists).
# This re-runs flat initialization and replaces the CI model directory, roughly doubling
# CI wall time.
# Default is 'no'; requires $CFG_MULTIPRON and a successful stage 21.
$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';

# Stages 21-22 are exercised with continuous ($CFG_HMM_TYPE eq '.cont.') models only.
# Semi-continuous and PTM models use a different training/alignment path, so both
# switches are forced off for any other HMM type.
if ($CFG_HMM_TYPE ne '.cont.') {
$CFG_MULTIPRON = 'no';
$CFG_CI_REESTIMATE_AFTER_MULTIPRON = 'no';
}

# Optional vocabulary-restricted dictionary (scripts/00a.vocab_dict/make_vocab_dict.pl).
# $CFG_VOCAB_DICTIONARY is the path the reduced dictionary is written to.
$CFG_VOCAB_DICT = 'no';
$CFG_VOCAB_DICTIONARY = "$CFG_LIST_DIR/$CFG_EXPTNAME.vocab.dic";

# Calculate an LDA/MLLT transform?
$CFG_LDA_MLLT = 'no';
# Dimensionality of LDA/MLLT output
Expand Down
83 changes: 83 additions & 0 deletions python/cmusphinx/vocab_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2009 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III

"""Filter dictionary to vocabulary from transcripts.

Creates a reduced dictionary containing only words that appear in the
training/test transcripts, preserving all pronunciation variants.
"""

import sys
from cmusphinx import s3dict


def load_vocab(vocab_path):
    """Read a vocabulary file and return its words as a set.

    The file is expected to hold one word per line.  Surrounding
    whitespace is stripped from every line, and lines that are empty
    after stripping are ignored.
    """
    with open(vocab_path) as fh:
        stripped = (raw.strip() for raw in fh)
        return {entry for entry in stripped if entry}


def filter_dict(indict, vocab, outfh):
    """Write the entries of ``indict`` whose word is in ``vocab`` to ``outfh``.

    Words are emitted in sorted order.  The first pronunciation of a word
    is written as ``WORD PHONES``; every further one as ``WORD(n) PHONES``
    with ``n`` counting from 2.

    Returns a ``(kept, unused)`` pair of sets: dictionary words that are
    in the vocabulary, and dictionary words that are not.
    """
    dict_words = set(indict.words())
    kept = vocab & dict_words
    unused = dict_words - vocab

    for word in sorted(kept):
        alt_no = 0
        for phones in indict.alts(word):
            alt_no += 1
            pron = " ".join(phones)
            if alt_no > 1:
                outfh.write("%s(%d) %s\n" % (word, alt_no, pron))
            else:
                outfh.write("%s %s\n" % (word, pron))

    return kept, unused


def main():
    """Command-line entry point: ``DICT VOCAB OUTDICT``.

    Reads the full dictionary ``DICT`` and the word list ``VOCAB``, writes
    the vocabulary-restricted dictionary to ``OUTDICT``, prints summary
    statistics to stdout, and writes the dictionary words that were left
    out to a sibling ``.unused`` file.  Vocabulary words missing from the
    dictionary are reported on stderr.

    Exits with status 1 when fewer than three arguments are given.
    """
    if len(sys.argv) < 4:
        print("Usage: %s DICT VOCAB OUTDICT" % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    dict_path, vocab_path, out_path = sys.argv[1:4]

    vocab = load_vocab(vocab_path)
    indict = s3dict.open(dict_path)
    in_words = set(indict.words())

    with open(out_path, "w") as outfh:
        kept, unused = filter_dict(indict, vocab, outfh)

    # Stats
    full_prons = sum(indict.maxalt[w] for w in in_words)
    kept_prons = sum(indict.maxalt[w] for w in kept)

    print("Full dictionary:     %7d words" % len(in_words))
    print("Transcript vocab:    %7d words" % len(vocab))
    print("Kept in reduced:     %7d words" % len(kept))
    print("Removed (unused):    %7d words" % len(unused))
    print("Full pronunciations: %7d" % full_prons)
    print("Kept pronunciations: %7d" % kept_prons)

    # Write unused words for tracing.  Only a trailing ".dic" is swapped for
    # ".unused"; any other output name gets ".unused" appended.  A blanket
    # str.replace() would also rewrite ".dic" inside directory names and,
    # when the name contains no ".dic" at all, would yield out_path itself
    # and overwrite the dictionary that was just written.
    if out_path.endswith(".dic"):
        unused_path = out_path[:-len(".dic")] + ".unused"
    else:
        unused_path = out_path + ".unused"
    with open(unused_path, "w") as fh:
        for w in sorted(unused):
            fh.write("%s\n" % w)
    print("Unused words: %s" % unused_path)

    # Warn about OOV
    oov = vocab - in_words
    if oov:
        print("OOV words:           %7d (missing from dict)" % len(oov),
              file=sys.stderr)

if __name__ == "__main__":
main()
78 changes: 78 additions & 0 deletions scripts/00a.vocab_dict/make_vocab_dict.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/perl
## ====================================================================
##
## Copyright (c) 1996-2000 Carnegie Mellon University. All rights
## reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions
## are met:
##
## 1. Redistributions of source code must retain the above copyright
## notice, this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the
## distribution.
##
## This work was supported in part by funding from the Defense Advanced
## Research Projects Agency and the National Science Foundation of the
## United States of America, and the CMU Sphinx Speech Consortium.
##
## THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
## ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
## THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
## NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
##
## ====================================================================

use strict;
use File::Basename;
use File::Spec::Functions;

use lib catdir(dirname($0), updir(), 'lib');
use SphinxTrain::Config;
use SphinxTrain::Util;

$| = 1; # Turn on autoflushing
Log("MODULE: 00a Create vocabulary-restricted dictionary\n");

# This stage is opt-in: anything other than 'yes' (including an unset
# variable from an older config) skips it successfully.
if ($ST::CFG_VOCAB_DICT ne "yes") {
    Log("Skipped (set \$CFG_VOCAB_DICT = 'yes' to enable)\n");
    exit(0);
}

my $logdir = "$ST::CFG_LOG_DIR/00a.vocab_dict";
mkdir($logdir, 0777) unless -d $logdir;
my $logfile = "$logdir/$ST::CFG_EXPTNAME.vocab_dict.log";

# The word list read here is not produced by this script; it must already
# exist (the error below points the user at 00.verify).
my $vocabfile = "$ST::CFG_LIST_DIR/$ST::CFG_EXPTNAME.vocab";
my $outdict = $ST::CFG_VOCAB_DICTIONARY;

unless (-f $vocabfile) {
    LogError("Vocabulary file not found: $vocabfile\n");
    LogError("Run 00.verify first to generate it\n");
    exit(1);
}

Log("Creating vocabulary-restricted dictionary\n");
Log("  Input dict:  $ST::CFG_DICTIONARY\n");
Log("  Vocabulary:  $vocabfile\n");
Log("  Output dict: $outdict\n");

# Make the bundled cmusphinx package importable.  Append with the platform's
# path-list separator, and do not emit a leading separator when PYTHONPATH is
# unset or empty: an empty PYTHONPATH element is not a harmless no-op.
my $pydir = catdir($ST::CFG_SPHINXTRAIN_DIR, 'python');
my $pathsep = ($^O eq 'MSWin32') ? ';' : ':';
if (defined($ENV{PYTHONPATH}) && $ENV{PYTHONPATH} ne '') {
    $ENV{PYTHONPATH} .= $pathsep . $pydir;
}
else {
    $ENV{PYTHONPATH} = $pydir;
}

# Run "python -m cmusphinx.vocab_dict DICT VOCAB OUTDICT" and propagate its
# status as this stage's exit code.
my $rv = RunTool("python", $logfile, 0,
                 "-m", "cmusphinx.vocab_dict",
                 $ST::CFG_DICTIONARY,
                 $vocabfile,
                 $outdict);

exit($rv);
39 changes: 39 additions & 0 deletions scripts/11.force_align/multipron_align.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env perl
## Run multipron_align.py using paths from sphinx_train.cfg (project base = $CFG_BASE_DIR).
##
## Usage (from training project directory, same as sphinxtrain run):
## perl path/to/scripts/11.force_align/multipron_align.pl [--dry-run] [--first-n N] [--binary PATH]
##
## See $CFG_MULTIPRON in etc/sphinx_train.cfg (no disables multipron; stage 21 runs this after CI by default).

use strict;
use warnings;
use File::Basename qw(dirname);
use File::Spec::Functions qw(catfile updir);

use lib catfile(dirname($0), updir(), 'lib');
use SphinxTrain::Config;

# Work on a copy of the command line; a leading --dry-run is pulled off so it
# can be placed ahead of the etc directory in the Python command below.
my @passthru = @ARGV;
my $dry_run = 0;
if (@passthru && $passthru[0] eq '--dry-run') {
    $dry_run = shift @passthru;
}

# The project's etc directory is handed to the Python script as its first
# positional argument, so it has to exist.
my $etc_dir = catfile($ST::CFG_BASE_DIR, 'etc');
unless (-d $etc_dir) {
    die "Missing directory $etc_dir (run from project after setup, or fix \$CFG_BASE_DIR)\n";
}

my $script = catfile($ST::CFG_SPHINXTRAIN_DIR, 'scripts', '11.force_align', 'multipron_align.py');
unless (-f $script) {
    die "Missing $script\n";
}

# Interpreter choice: $PYTHON from the environment wins; otherwise python3,
# except on Windows where plain `python` is used.
my $interp = $ENV{PYTHON};
unless (defined($interp) && $interp ne '') {
    $interp = 'python3';
    $interp = 'python' if $^O eq 'MSWin32';
}

my @cmd = ($interp, $script);
push @cmd, '--dry-run' if $dry_run;
push @cmd, $etc_dir, @passthru;

# exec only returns when it failed to start the interpreter.
exec(@cmd) or die "exec failed: $!\n";
Loading
Loading