diff --git a/.github/workflows/matrix-tests.yml b/.github/workflows/matrix-tests.yml index 8673ffce..d84b9fc2 100644 --- a/.github/workflows/matrix-tests.yml +++ b/.github/workflows/matrix-tests.yml @@ -27,7 +27,7 @@ jobs: - uses: awalsh128/cache-apt-pkgs-action@acb598e5ddbc6f68a970c5da0688d2f3a9f04d05 # v1.6.0 with: packages: sox libsox-dev - - uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2 + - uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache - name: Install dependencies and EveryVoice itself run: | CUDA_TAG=cpu pip install -r requirements.torch.txt --find-links https://download.pytorch.org/whl/torch_stable.html diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6218f321..0faec7d6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,7 +26,7 @@ jobs: packages: sox libsox-dev - name: Verify SoX installation run: sox --version - - uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2 + - uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache - uses: actions/setup-python@v6 with: python-version: "3.10" @@ -100,7 +100,7 @@ jobs: - uses: actions/checkout@v6 with: submodules: recursive - - uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2 + - uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache - uses: actions/setup-python@v6 with: python-version: "3.10" @@ -170,7 +170,7 @@ jobs: - name: Run license check overall run: | licensecheck --requirements-paths pyproject.toml --zero \ - --ignore-packages text-unidecode pympi-ling pyworld pyworld-prebuilt pysdtw audioread anytree gradio hf-gradio \ + --ignore-packages text-unidecode pympi-ling pyworld pyworld-prebuilt audioread anytree gradio hf-gradio \ --skip-dependencies llvmlite \ --ignore-licenses OTHER/PROPRIETARY || \ ! echo "Package(s) listed with an X above is/are potentially a problem. Please review their licenses for compatibility with EveryVoice." diff --git a/everyvoice/model/aligner/wav2vec2aligner b/everyvoice/model/aligner/wav2vec2aligner index 21043241..170bb748 160000 --- a/everyvoice/model/aligner/wav2vec2aligner +++ b/everyvoice/model/aligner/wav2vec2aligner @@ -1 +1 @@ -Subproject commit 21043241932e94f3176431b7f9e708b7b15a8f2e +Subproject commit 170bb748bfe83c1830befe8e5d5a5996c215a858 diff --git a/everyvoice/tests/test_custom_g2p.py b/everyvoice/tests/test_custom_g2p.py index 8e68fa6a..25f1d47c 100755 --- a/everyvoice/tests/test_custom_g2p.py +++ b/everyvoice/tests/test_custom_g2p.py @@ -95,6 +95,15 @@ def test_basic_g2p(self): with self.assertRaises(NotImplementedError): get_g2p_engine("boop") + def test_unusual_ipa_code(self): + # sal-apa goes to sal-ipa instead of sal-apa-ipa + sal_apa_g2p = get_g2p_engine("sal-apa") + self.assertEqual(sal_apa_g2p("ac"), list("ats")) + + # but iku-sro goes to iku-sro-ipa, not iku-ipa + iku_sro_g2p = get_g2p_engine("iku-sro") + self.assertEqual(iku_sro_g2p("akaq"), list("akaq")) + def test_phonemizer_normalization(self): moh_g2p = get_g2p_engine("moh") self.assertEqual(moh_g2p("\u00e9"), ["\u00e9"]) diff --git a/everyvoice/text/phonemizer.py b/everyvoice/text/phonemizer.py index 86950a9a..31d8b1d9 100644 --- a/everyvoice/text/phonemizer.py +++ b/everyvoice/text/phonemizer.py @@ -13,7 +13,7 @@ DEFAULT_G2P = "DEFAULT_G2P" -def make_default_g2p_engines(): +def make_default_g2p_engines() -> dict[str, str | G2PCallable]: return {k: DEFAULT_G2P for k in get_arpabet_langs()[0]} @@ -32,9 +32,24 @@ def make_default_g2p_engines(): class CachingG2PEngine: """caching tokenizing g2p engine""" - def __init__(self, lang_id): - self._cache = {} - self.phonemizer = make_g2p(lang_id, f"{lang_id}-ipa") + def __init__(self, lang_id: str) -> None: + self._cache: dict[str, list[str]] = {} + self.phonemizer = make_g2p(lang_id, self.get_ipa_code(lang_id)) + + def get_ipa_code(self, lang_id: str) -> str: + """Given a lang ID in get_arpabet_langs()[0], find its IPA language code. + + Most languages in the g2p library have a three letter code lll mapped to + lll-ipa, but a few do not, e.g., sal-apa -> sal-ipa, oji-syl -> oji-ipa + + Copied from g2p.get_ipa_code(), for compatibility with any version of g2p.""" + + from g2p.mappings.langs import LANGS_NETWORK + + if lang_id + "-ipa" in LANGS_NETWORK.nodes: + return lang_id + "-ipa" + else: + return lang_id.split("-", 1)[0] + "-ipa" def process_one_token(self, input_token: str) -> list[str]: """Process one input token, dumbly split on whitespace. @@ -62,7 +77,7 @@ def process_one_token(self, input_token: str) -> list[str]: def __call__(self, normalized_input_text: str) -> list[str]: input_tokens = re.split(r"(\s+)", normalized_input_text) - output_tokens = [] + output_tokens: list[str] = [] for token in input_tokens: cached = self._cache.get(token, None) if cached is None: