From e22a6ae8776728e84f340065428983956237fc0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miroslav=20Such=C3=BD?= <msuchy@redhat.com>
Date: Fri, 26 Dec 2025 22:04:17 +0100
Subject: [PATCH] support for Czech

---
 README.md    |   3 +
 misaki/cs.py | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 misaki/cs.py

diff --git a/README.md b/README.md
index 211a322..7382156 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,9 @@ The first gen Chinese tokenizer uses jieba to cut, pypinyin, and pinyin-to-ipa.
 ### Vietnamese
 - https://github.com/v-nhandt21/Viphoneme
 
+### Czech
+First version of the Czech tokenizer: rule-based grapheme-to-phoneme conversion (see `misaki/cs.py`).
+
 ### TODO
 - [ ] Data: Compress [data](https://github.com/hexgrad/misaki/tree/main/misaki/data) (no need for indented json) and eliminate redundancy between gold and silver dictionaries.
 - [ ] Fallbacks: Train seq2seq fallback models on dictionaries using [this notebook](https://github.com/Kyubyong/nlp_made_easy/blob/master/PyTorch%20seq2seq%20template%20based%20on%20the%20g2p%20task.ipynb).
diff --git a/misaki/cs.py b/misaki/cs.py
new file mode 100644
index 0000000..6bb4f6e
--- /dev/null
+++ b/misaki/cs.py
@@ -0,0 +1,206 @@
+#  SPDX-License-Identifier: Apache-2.0
+
+# Grapheme to Phoneme for the Czech language.
+# Originally developed by Richard Mazur
+# https://github.com/essare-rimaz/grapheme_to_phoneme_CZ/blob/main/server.r
+# Later converted to Python by Miroslav Suchy <msuchy@redhat.com> with the
+# assistance of AI, and released under the Apache-2.0 license with Richard's
+# permission.
+
+# Czech Phonology
+# https://cs.wikipedia.org/wiki/Fonologie_%C4%8De%C5%A1tiny
+
+
+from typing import Dict, List, Optional, Tuple
+from .token import MToken
+
+# Grapheme (single letter or merged unit) -> IPA; unlisted symbols pass through.
+IPA: Dict[str, str] = {
+    "a": "a", "á": "aː", "b": "b", "c": "t͡s", "č": "t͡ʃ", "d": "d", "ď": "ɟ",
+    "e": "ɛ", "é": "ɛː", "ě": "jɛ", "f": "f", "g": "ɡ", "h": "ɦ", "ch": "x",
+    "i": "ɪ", "í": "iː", "j": "j", "k": "k", "l": "l", "m": "m", "n": "n",
+    "ň": "ɲ", "o": "o", "ó": "oː", "p": "p", "q": "k", "r": "r", "s": "s",
+    "š": "ʃ", "t": "t", "ť": "c", "u": "u", "ú": "uː", "ů": "uː", "v": "v",
+    "w": "w", "x": "ks", "y": "ɪ", "ý": "iː", "z": "z", "ž": "ʒ",
+    "di": "ɟɪ", "dí": "ɟiː", "dě": "ɟɛ",
+    "ti": "cɪ", "tí": "ciː", "tě": "cɛ",
+    "ni": "ɲɪ", "ní": "ɲiː", "ně": "ɲɛ",
+    "mě": "mɲɛ", "bě": "bjɛ", "pě": "pjɛ", "vě": "vjɛ",
+    "ts": "t͡s", "dz": "d͡z", "dž": "d͡ʒ", "ř": "r̝",
+    "ie": "ɪjɛ", "ia": "ɪja", "io": "ɪjo",
+}
+
+# Identity table of every grapheme (single letters and multi-letter units)
+# known to the converter; ``TEMP.get`` returns None for anything else.
+TEMP: Dict[str, str] = {
+    grapheme: grapheme
+    for grapheme in (
+        "a", "á", "b", "c", "č", "d", "ď", "e", "é", "ě", "f", "g", "h", "ch",
+        "i", "í", "j", "k", "l", "m", "n", "ň", "o", "ó", "p", "q", "r", "ř",
+        "s", "š", "t", "ť", "u", "ú", "ů", "v", "w", "x", "y", "ý", "z", "ž",
+        "di", "dí", "dě", "ti", "tí", "tě", "ni", "ní", "ně",
+        "mě", "bě", "pě", "vě",
+        "dz", "ts", "ie", "ia", "io",
+        " ",
+    )
+}
+
+PAIRED_CONSONANTS: Dict[str, str] = {
+    "b": "p", "d": "t", "ď": "ť", "g": "k", "v": "f", "z": "s", "ž": "š",
+    "ch": "h", "dz": "c", "dž": "č",
+}
+# Make the table symmetric: every partner maps back to its counterpart.
+PAIRED_CONSONANTS.update({v: k for k, v in PAIRED_CONSONANTS.items()})
+
+# Unvoiced members of the voicing pairs, as an identity map.
+PAIRED_UNVOICED: Dict[str, str] = {
+    c: c for c in ("p", "t", "ť", "k", "f", "s", "š", "ch", "c", "č")
+}
+
+# Voiced members of the voicing pairs, as an identity map.
+PAIRED_VOICED: Dict[str, str] = {
+    c: c for c in ("b", "d", "ď", "g", "v", "z", "ž", "dz", "dž")
+}
+
+DTN = {"d": "d", "t": "t", "n": "n"}  # consonants merged with a following DTN_VOCAL
+DTN_VOCAL = {"í": "í", "i": "i", "ě": "ě"}  # vowels merged onto a preceding DTN
+
+MBPV = {"m": "m", "b": "b", "p": "p", "v": "v"}  # consonants merged with "ě"
+MBPV_VOCAL = {"ě": "ě"}  # vowel merged onto a preceding MBPV consonant
+
+CH_FIRST = {"c": "c"}  # "c" + "h" => "ch"
+CH_SECOND = {"h": "h"}
+
+TS_FIRST = {"t": "t"}  # "t" + "s" => "ts"
+TS_SECOND = {"s": "s"}
+
+DZ_FIRST = {"d": "d"}  # "d" + "z" => "dz"
+DZ_SECOND = {"z": "z"}
+
+IEIAIO_FIRST = {"i": "i"}  # "i" + "e"/"a"/"o" => "ie"/"ia"/"io"
+IEIAIO_SECOND = {"e": "e", "a": "a", "o": "o"}
+
+
+def _indices_where_in(v: List[Optional[str]], keyset: Dict[str, str]) -> List[int]:
+    """Positions in ``v`` holding a key of ``keyset`` (mimics R ``which(v %in% x)``)."""
+    wanted = frozenset(keyset)
+    return [pos for pos, item in enumerate(v) if item in wanted]
+
+class CSG2P:
+    """Rule-based grapheme-to-phoneme (G2P) converter for Czech.
+
+    The input is lower-cased and split into single characters; characters
+    that are not keys of ``TEMP`` are dropped.  The remaining symbols are
+    rewritten by a fixed sequence of passes (digraph merging, voicing
+    assimilation, palatalisation merging, final devoicing) and are finally
+    translated through the ``IPA`` table.
+
+    The pass order matters: a digraph produced by an earlier pass is no
+    longer a single character, so later single-character rules skip it
+    (for example ``n`` followed by the merged ``ie`` is not palatalised).
+
+    Example::
+
+        phonemes, tokens = CSG2P()("chata")
+        # phonemes == "xata"
+    """
+
+    @staticmethod
+    def _merge_pairs(
+        symbols: List[Optional[str]],
+        first: Dict[str, str],
+        second: Dict[str, str],
+    ) -> None:
+        """Fuse each *first* symbol with a directly following *second* symbol.
+
+        The fused string is stored at the first position and the second
+        position is set to ``None`` so that list indices stay stable for
+        the passes that follow.  *symbols* is modified in place.
+        """
+        for i in _indices_where_in(symbols, first):
+            j = i + 1
+            if j < len(symbols) and symbols[j] in second:
+                symbols[i] = f"{symbols[i]}{symbols[j]}"
+                symbols[j] = None
+
+    @staticmethod
+    def _assimilate(
+        symbols: List[Optional[str]],
+        current: Dict[str, str],
+        following: Dict[str, str],
+    ) -> None:
+        """Swap each *current* symbol that directly precedes a *following* one.
+
+        Only the next list slot is inspected; the replacement comes from
+        ``PAIRED_CONSONANTS``.  *symbols* is modified in place.
+        """
+        for i in _indices_where_in(symbols, current):
+            j = i + 1
+            if j < len(symbols) and symbols[j] in following:
+                symbols[i] = PAIRED_CONSONANTS.get(symbols[i], symbols[i])
+
+    def __call__(self, text: str) -> Tuple[str, List[MToken]]:
+        """Convert Czech *text* to IPA.
+
+        Args:
+            text: Text to convert.  ``None`` yields an empty result, and
+                characters that are not keys of ``TEMP`` are skipped.
+
+        Returns:
+            A ``(phonemes, tokens)`` tuple: the concatenated IPA string and
+            one ``MToken`` per emitted symbol, in output order.
+        """
+        if text is None:
+            # Keep the declared tuple shape so callers can always unpack.
+            return "", []
+
+        # Characters missing from TEMP become None (NA in the original R code).
+        symbols: List[Optional[str]] = [TEMP.get(ch) for ch in text.lower()]
+
+        # 1) i followed by e/a/o => ie/ia/io
+        self._merge_pairs(symbols, IEIAIO_FIRST, IEIAIO_SECOND)
+
+        # 2) d + z => dz
+        self._merge_pairs(symbols, DZ_FIRST, DZ_SECOND)
+
+        # 3) t + s => ts
+        self._merge_pairs(symbols, TS_FIRST, TS_SECOND)
+
+        # 4) voicing assimilation: unvoiced before voiced => swap with its pair
+        self._assimilate(symbols, PAIRED_UNVOICED, PAIRED_VOICED)
+
+        # 5) voicing assimilation: voiced before unvoiced => swap with its pair
+        self._assimilate(symbols, PAIRED_VOICED, PAIRED_UNVOICED)
+
+        # 6) c + h => ch
+        self._merge_pairs(symbols, CH_FIRST, CH_SECOND)
+
+        # 7) d/t/n + (i/í/ě) => di/dí/dě, ti/tí/tě, ni/ní/ně
+        self._merge_pairs(symbols, DTN, DTN_VOCAL)
+
+        # 8) m/b/p/v + ě => mě/bě/pě/vě
+        self._merge_pairs(symbols, MBPV, MBPV_VOCAL)
+
+        # na.omit: drop unknown characters and the slots emptied by merging.
+        graphemes: List[str] = [s for s in symbols if s is not None]
+
+        # Final devoicing.  Done after the None removal so that a trailing
+        # unknown character (e.g. punctuation) or a slot emptied by merging
+        # cannot hide the last real symbol.
+        if graphemes and graphemes[-1] in PAIRED_VOICED:
+            last = graphemes[-1]
+            graphemes[-1] = PAIRED_CONSONANTS.get(last, last)
+
+        # Symbols without an IPA entry (e.g. the space) pass through unchanged.
+        phonemes: List[str] = [IPA.get(g, g) for g in graphemes]
+        mtokens: List[MToken] = [
+            MToken(g, '', ' ', p) for g, p in zip(graphemes, phonemes)
+        ]
+        return "".join(phonemes), mtokens
+
+
+if __name__ == "__main__":
+    # Smoke test; run as ``python -m misaki.cs`` because of the relative import above.
+    converter = CSG2P()
+    for word in ("odzbrojit se", "tsar", "filosofie", "nokia", "rio", "chata", "město", "běh"):
+        print(word, "->", converter(word)[0])