Commit e9bd355

Created lexicons for cjk and non-cjk texts
1 parent 4a939fd commit e9bd355

3 files changed: 47 additions & 37 deletions

deltas/tokenizers/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split
+from .wikitext_split import wikitext_split, wikitext_split_cjk

-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
+           wikitext_split_cjk]
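
With the updated __all__, both tokenizers are importable from the package root. A minimal usage sketch, assuming the deltas package is installed; the sample string is made up for illustration:

    from deltas.tokenizers import wikitext_split, wikitext_split_cjk

    # Both objects are RegexTokenizer instances exposing the same tokenize()
    # interface exercised by the tests below.
    for token in wikitext_split_cjk.tokenize("これはテストです"):
        print(repr(token), token.type)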

deltas/tokenizers/tests/test_wikitext_split.py

Lines changed: 29 additions & 1 deletion
@@ -1,6 +1,6 @@
 from nose.tools import eq_

-from ..wikitext_split import wikitext_split
+from ..wikitext_split import wikitext_split, wikitext_split_cjk


 def test_wikitext_split():

@@ -150,6 +150,13 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."

@@ -185,6 +192,13 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \

@@ -242,6 +256,13 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'

@@ -277,3 +298,10 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
+
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
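
The commit reruns the existing non-CJK fixtures through wikitext_split_cjk but does not add a CJK-specific case. A hedged sketch of what such a test might look like, following the same pattern; the test name, input string, and expected single-character 'cjk' tokens are assumptions, not part of this commit:

    from nose.tools import eq_

    from ..wikitext_split import wikitext_split_cjk


    def test_cjk():
        # Hypothetical fixture: each CJK character is expected to come back as
        # its own token of type 'cjk', since the cjk character class carries
        # no quantifier.
        input = "日本語"
        expected = [('日', 'cjk'), ('本', 'cjk'), ('語', 'cjk')]

        tokens = list(wikitext_split_cjk.tokenize(input))

        for token, (s, t) in zip(tokens, expected):
            print(repr(token), (s, t))
            eq_(token, s)
            eq_(token.type, t)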

deltas/tokenizers/wikitext_split.py

Lines changed: 15 additions & 34 deletions
@@ -15,44 +15,20 @@
 )
 # re.compile(url, re.U).match("https://website.gov?param=value")

-# Matches Chinese, Japanese and Korean characters.
-cjk = (
-    r'[' +
-    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
-    r'\u3041-\u3096' +  # Hiragana
-    r'\u30A0-\u30FF' +  # Katakana
-    r'\u3400-\u4DB5' +  # Kanji
-    r'\u4E00-\u9FCB' +
-    r'\uF900-\uFA6A' +
-    r'\u2E80-\u2FD5' +  # Kanji radicals
-    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
-    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
-    r'\u3220-\u3243' +
-    r'\u3280-\u337F'
-    r']'
-)
-
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
               r'\u061C-\u0669' + \
               r'\u06D5-\u06EF'
 bengali_word = r'\u0980-\u09FF'
-combined_word = devangari_word + arabic_word + bengali_word
+korean_word = r'\uac00-\ud7a3'
+
+combined_word = devangari_word + arabic_word + bengali_word + korean_word

-word = r'(?:[^\W\d]|[' + combined_word + r'])' + \
+cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF'
+
+cjk = r'[' + cjk_re + ']'
+
+word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \
        r'[\w' + combined_word + r']*' + \
        r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*'

@@ -71,7 +47,6 @@
     ("bold", r"'''"),
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
-    ('cjk', cjk),
     ('word', word),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),

@@ -97,4 +72,10 @@
     ("etc", r"."),
 ]

-wikitext_split = RegexTokenizer(LEXICON)
+LEXICON_LATIN = LEXICON.copy()
+LEXICON_LATIN.insert(-2, ('cjk', cjk))
+wikitext_split = RegexTokenizer(LEXICON_LATIN)
+
+LEXICON_CJK = LEXICON.copy()
+LEXICON_CJK.insert(0, ('cjk', cjk))
+wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
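
Both tokenizers are built from the same base LEXICON and differ only in where the 'cjk' rule is inserted: near the end of the list for wikitext_split, first for wikitext_split_cjk, which changes its precedence relative to the other rules. A rough comparison sketch, assuming lexicon entries are tried in order by the RegexTokenizer; the sample text is hypothetical:

    from deltas.tokenizers import wikitext_split, wikitext_split_cjk

    text = "中文 wiki text"

    # Each CJK character should surface as a separate 'cjk' token, since the
    # cjk character class has no quantifier; Latin words still match 'word'
    # because cjk_re is excluded from the word pattern's first character.
    print([(repr(t), t.type) for t in wikitext_split.tokenize(text)])
    print([(repr(t), t.type) for t in wikitext_split_cjk.tokenize(text)])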
