Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/matrix-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
- uses: awalsh128/cache-apt-pkgs-action@acb598e5ddbc6f68a970c5da0688d2f3a9f04d05 # v1.6.0
with:
packages: sox libsox-dev
- uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2
- uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache
- name: Install dependencies and EveryVoice itself
run: |
CUDA_TAG=cpu pip install -r requirements.torch.txt --find-links https://download.pytorch.org/whl/torch_stable.html
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
packages: sox libsox-dev
- name: Verify SoX installation
run: sox --version
- uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2
- uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache
- uses: actions/setup-python@v6
with:
python-version: "3.10"
Expand Down Expand Up @@ -100,7 +100,7 @@ jobs:
- uses: actions/checkout@v6
with:
submodules: recursive
- uses: FedericoCarboni/setup-ffmpeg@583042d32dd1cabb8bd09df03bde06080da5c87c # v2
- uses: FedericoCarboni/setup-ffmpeg@37062fbf7149fc5578d6c57e08aed62458b375d6 # @v3.1, with tool cache
- uses: actions/setup-python@v6
with:
python-version: "3.10"
Expand Down Expand Up @@ -170,7 +170,7 @@ jobs:
- name: Run license check overall
run: |
licensecheck --requirements-paths pyproject.toml --zero \
--ignore-packages text-unidecode pympi-ling pyworld pyworld-prebuilt pysdtw audioread anytree gradio hf-gradio \
--ignore-packages text-unidecode pympi-ling pyworld pyworld-prebuilt audioread anytree gradio hf-gradio \
--skip-dependencies llvmlite \
--ignore-licenses OTHER/PROPRIETARY || \
! echo "Package(s) listed with an X above is/are potentially a problem. Please review their licenses for compatibility with EveryVoice."
Expand Down
2 changes: 1 addition & 1 deletion everyvoice/model/aligner/wav2vec2aligner
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

needs to rebase onto main

9 changes: 9 additions & 0 deletions everyvoice/tests/test_custom_g2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ def test_basic_g2p(self):
with self.assertRaises(NotImplementedError):
get_g2p_engine("boop")

def test_unusual_ipa_code(self):
# sal-apa goes to sal-ipa instead of sal-apa-ipa
sal_apa_g2p = get_g2p_engine("sal-apa")
self.assertEqual(sal_apa_g2p("ac"), list("ats"))

# but iku-sro goes to iku-sro-ipa, not iku-ipa
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, why does this not go to iku-ipa?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we kind of assumed this *-ipa convention that isn't enforced

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Running `g2p show-mappings | grep iku` will tell you that we have iku->iku-equiv->iku-ipa->eng-ipa as the path from syllabics, and iku-sro->iku-sri-ipa->iku-sro-ipa->eng-ipa as the path for romanized text, and those two paths are simply not connected. I don't know why we made that choice, but since we never had an official policy or a way to declare "this is the IPA code for language X", whoever wrote the mapping presumably chose what seemed intuitive to them.

Actually, the git logs tell me this dates back to 2019, with the commit message "first attempt at consolidating langs", so I'm going to guess this might have been an artefact of the merging process. We could change things in g2p, and we should probably add a function to the API that returns the IPA code for any non-IPA code that leads to IPA one way or another. But my problem would remain: any such solution would only work going forward and would not be compatible with older versions of g2p, hence my solution here.

iku_sro_g2p = get_g2p_engine("iku-sro")
self.assertEqual(iku_sro_g2p("akaq"), list("akaq"))

def test_phonemizer_normalization(self):
moh_g2p = get_g2p_engine("moh")
self.assertEqual(moh_g2p("\u00e9"), ["\u00e9"])
Expand Down
25 changes: 20 additions & 5 deletions everyvoice/text/phonemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
DEFAULT_G2P = "DEFAULT_G2P"


def make_default_g2p_engines():
def make_default_g2p_engines() -> dict[str, str | G2PCallable]:
return {k: DEFAULT_G2P for k in get_arpabet_langs()[0]}


Expand All @@ -32,9 +32,24 @@ def make_default_g2p_engines():
class CachingG2PEngine:
"""caching tokenizing g2p engine"""

def __init__(self, lang_id):
self._cache = {}
self.phonemizer = make_g2p(lang_id, f"{lang_id}-ipa")
def __init__(self, lang_id: str) -> None:
self._cache: dict[str, list[str]] = {}
self.phonemizer = make_g2p(lang_id, self.get_ipa_code(lang_id))

def get_ipa_code(self, lang_id: str) -> str:
    """Return the IPA language code that g2p associates with ``lang_id``.

    ``lang_id`` is expected to be one of the codes in get_arpabet_langs()[0].
    The usual g2p convention maps a code ``lll`` to ``lll-ipa``, but some
    codes carry a suffix that is dropped on the IPA side, e.g.,
    sal-apa -> sal-ipa, oji-syl -> oji-ipa.

    Copied from g2p.get_ipa_code(), for compatibility with any version of g2p.
    """
    from g2p.mappings.langs import LANGS_NETWORK

    # Prefer the direct "<lang_id>-ipa" node when the g2p network has one.
    direct_code = lang_id + "-ipa"
    if direct_code in LANGS_NETWORK.nodes:
        return direct_code

    # Otherwise keep only what precedes the first hyphen and append "-ipa".
    base_lang = lang_id.partition("-")[0]
    return base_lang + "-ipa"

def process_one_token(self, input_token: str) -> list[str]:
"""Process one input token, dumbly split on whitespace.
Expand Down Expand Up @@ -62,7 +77,7 @@ def process_one_token(self, input_token: str) -> list[str]:

def __call__(self, normalized_input_text: str) -> list[str]:
input_tokens = re.split(r"(\s+)", normalized_input_text)
output_tokens = []
output_tokens: list[str] = []
for token in input_tokens:
cached = self._cache.get(token, None)
if cached is None:
Expand Down
Loading