diff --git a/g2p/__init__.py b/g2p/__init__.py index c7cf077a..020403ea 100644 --- a/g2p/__init__.py +++ b/g2p/__init__.py @@ -1,11 +1,11 @@ """ - Basic init file for g2p module The main entry points for the g2p module are: - make_g2p() to create a mapper from and lang to another - make_tokenizer() to create a tokenizer for a given language - get_arpabet_langs() to get the list of languages with a path to eng-arpabet + - get_ipa_code() to get the name of the canonical IPA lang code for a given lang id Basic Usage: from g2p import make_g2p @@ -222,6 +222,22 @@ def get_arpabet_langs(): return _langs_cache, _lang_names_cache +def get_ipa_code(lang_id: str) -> str: + """Given a lang ID in get_arpabet_langs()[0], find its IPA language code. + + You can import this function from g2p if you set your dependency to g2p as + g2p>2.3.1, but if you want to remain compatible with older versions of g2p, + it is safe to copy it into your code instead. This function has been + confirmed to work for all published versions of g2p>=0.2, and we commit to + keep it working unchanged for all future versions of g2p.""" + from g2p.mappings.langs import LANGS_NETWORK + + if lang_id + "-ipa" in LANGS_NETWORK.nodes: + return lang_id + "-ipa" + else: + return lang_id.split("-", 1)[0] + "-ipa" + + def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer: """Make the tokenizer for input in language in_lang @@ -254,6 +270,7 @@ def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer: "NoPath", "Token", "get_arpabet_langs", + "get_ipa_code", "make_g2p", "make_tokenizer", "tokenize_and_map", diff --git a/g2p/mappings/langs/__init__.py b/g2p/mappings/langs/__init__.py index c2bdf48b..9754bd8b 100644 --- a/g2p/mappings/langs/__init__.py +++ b/g2p/mappings/langs/__init__.py @@ -56,6 +56,10 @@ def get_available_mappings(langs: dict) -> list: return mappings_available +# Inadvertently part of the g2p programmatic API because this is not available for +# 
import elsewhere. Don't change this! The following code must always work: +# from g2p.mappings.langs import LANGS_NETWORK +# nodes: Collection[str] = LANGS_NETWORK.nodes LANGS_NETWORK = load_network() # Making private because it should be imported from g2p.mappings instead _LANGS = load_langs() diff --git a/g2p/mappings/langs/network_lite.py b/g2p/mappings/langs/network_lite.py index 01adb016..efc652eb 100644 --- a/g2p/mappings/langs/network_lite.py +++ b/g2p/mappings/langs/network_lite.py @@ -1,6 +1,7 @@ from collections import deque from typing import ( Any, + Collection, Deque, Dict, Generic, @@ -58,7 +59,7 @@ def add_edges_from(self, edges: Iterable[Tuple[T, T]]): self.add_edge(u, v) @property # read-only - def nodes(self): + def nodes(self) -> Collection[T]: """Return the nodes""" return self._edges.keys() diff --git a/g2p/mappings/langs/utils.py b/g2p/mappings/langs/utils.py index dd7c76c8..a837bcb5 100644 --- a/g2p/mappings/langs/utils.py +++ b/g2p/mappings/langs/utils.py @@ -202,9 +202,10 @@ def network_to_echart(outfile: Optional[str] = None, layout: bool = False): ), ) size = round(size, 2) - node = {"name": node, "symbolSize": size, "id": node, "category": lang_name} - nodes.append(node) - nodes.sort(key=lambda x: x["name"]) + nodes.append( + {"name": node, "symbolSize": size, "id": node, "category": lang_name} + ) + nodes.sort(key=lambda x: x["name"]) # type: ignore edges = [] for edge in LANGS_NETWORK.edges: edges.append({"source": edge[0], "target": edge[1]}) diff --git a/g2p/tests/test_langs.py b/g2p/tests/test_langs.py index f5266fe1..62d7cf76 100755 --- a/g2p/tests/test_langs.py +++ b/g2p/tests/test_langs.py @@ -1,49 +1,95 @@ #!/usr/bin/env python import sys -from unittest import TestCase +from typing import Collection from pytest import main -from g2p import make_g2p +from g2p import get_arpabet_langs, get_ipa_code, make_g2p from g2p.log import LOGGER +from g2p.mappings.langs import LANGS_NETWORK from g2p.tests.public.data import 
load_public_test_data -class LangTest(TestCase): +def test_io() -> None: """Basic Test for individual lookup tables. Test files (in g2p/tests/public/data) are either .csv, .psv, or .tsv files, the only difference being the delimiter used (comma, pipe, or tab). - Each line in the test file consists of SOURCE,TARGET,INPUT,OUTPUT - - """ - - def test_io(self): - langs_to_test = load_public_test_data() - - # go through each language declared in the test case set up - # Instead of asserting immediately, we go through all the cases first, so that - # running test_langs.py prints all the errors at once, to help debugging a given g2p mapping. - # Then we call assertEqual on the first failed case, to make unittest register the failure. - error_count = 0 - error_prefix = "test_langs.py: mapping error" - for test in langs_to_test: - transducer = make_g2p(test[0], test[1]) - output_string = transducer(test[2]).output_string.strip() - if output_string != test[3].strip(): - LOGGER.error( - f"{error_prefix} for {test[-1]}: {test[2]} from {test[0]} to {test[1]} should be {test[3]}, got {output_string}" - ) - error_count += 1 - - self.assertEqual( - error_count, - 0, - f'Search for "ERROR - {error_prefix}" above to find all the g2p mapping errors.', - ) + Each line in the test files consists of SOURCE,TARGET,INPUT,OUTPUT""" + langs_to_test = load_public_test_data() + + # go through each language declared in the test case set up + # Instead of asserting immediately, we go through all the cases first, so that + # running test_langs.py prints all the errors at once, to help debugging a given g2p mapping. + # Then we assert that the error count is zero, to make pytest register the failure.
+ error_count = 0 + error_prefix = "test_langs.py: mapping error" + for test in langs_to_test: + transducer = make_g2p(test[0], test[1]) + output_string = transducer(test[2]).output_string.strip() + if output_string != test[3].strip(): + LOGGER.error( + f"{error_prefix} for {test[-1]}: {test[2]} from {test[0]} to {test[1]} should be {test[3]}, got {output_string}" + ) + error_count += 1 + + assert ( + error_count == 0 + ), f'g2p mapping errors found, look for "{error_prefix}" above for detail.' + + +def test_ipa_heuristic(subtests) -> None: + """Make sure we have a reliable heuristic for finding the IPA code for all langs. + + In EveryVoice, we want to be able to assume that a simple heuristic works to find + the IPA language code for a given language code, so let's exercise this heuristic + here and thus make sure it will always work. + + The first heuristic was that lang_id + "-ipa" is the IPA code, but that breaks with + sal-apa -> sal-ipa and oji-syl -> oji-ipa. + A mostly correct heuristic is lang_id.split("-",1)[0]+"-ipa", but this fails for + iku-sro -> iku-sro-ipa, since iku-ipa exists but there is no path from iku-sro + to iku-ipa. + So the correct heuristic is: + 1) try lang_id + "-ipa" and use it if it is in LANGS_NETWORK.nodes + 2) otherwise use lang_id.split("-",1)[0] + "-ipa" + Sigh...""" + + def locked_get_ipa_code(lang_id: str) -> str: + # Prevent inadvertent changes to g2p.get_ipa_code with this locked test copy, + # including this deep import which we promise will keep working.
+ from g2p.mappings.langs import LANGS_NETWORK + + if lang_id + "-ipa" in LANGS_NETWORK.nodes: + return lang_id + "-ipa" + else: + return lang_id.split("-", 1)[0] + "-ipa" + + # Make sure client code can assume "lang_id in nodes" will work + nodes: Collection[str] = LANGS_NETWORK.nodes + assert isinstance(nodes, Collection) + + langs, _ = get_arpabet_langs() + + for lang in langs: + with subtests.test(lang=lang): + ipa_code = get_ipa_code(lang) + assert ipa_code == locked_get_ipa_code(lang) + assert ipa_code in LANGS_NETWORK.nodes + assert LANGS_NETWORK.has_path(lang, ipa_code) + + for hypothetical_lang, ref_ipa_code in ( + ("ll-foo", "ll-ipa"), + ("lll-bar", "lll-ipa"), + ("lang-foo", "lang-ipa"), + ("language-bar", "language-ipa"), + ("lang", "lang-ipa"), + ("lll-foo-bar-baz", "lll-ipa"), + ): + assert get_ipa_code(hypothetical_lang) == ref_ipa_code if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 0b9e2667..a6c358ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ test = [ "jsonschema>=4.17.3", "pep440>=0.1.2", "pytest", + "pytest-subtests; python_version < '3.10'", "httpx", # Kind of bogus that we need both httpx and aiohttp, but socketio # wants this