Commit e9bd355

Created lexicons for cjk and non-cjk texts
1 parent 4a939fd commit e9bd355

3 files changed: 47 additions & 37 deletions

deltas/tokenizers/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
 from .tokenizer import Tokenizer, RegexTokenizer
 from .token import Token
 from .text_split import text_split
-from .wikitext_split import wikitext_split
+from .wikitext_split import wikitext_split, wikitext_split_cjk

-__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split]
+__all__ = [Tokenizer, RegexTokenizer, Token, text_split, wikitext_split,
+           wikitext_split_cjk]
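
With the updated __all__, both tokenizers are importable from the package root. A minimal usage sketch, assuming the deltas package is installed; the sample string is made up for illustration:

    from deltas.tokenizers import wikitext_split, wikitext_split_cjk

    # Both objects are RegexTokenizer instances exposing the same tokenize()
    # interface exercised by the tests below.
    for token in wikitext_split_cjk.tokenize("これはテストです"):
        print(repr(token), token.type)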

deltas/tokenizers/tests/test_wikitext_split.py

Lines changed: 29 additions & 1 deletion
@@ -1,6 +1,6 @@
 from nose.tools import eq_

-from ..wikitext_split import wikitext_split
+from ..wikitext_split import wikitext_split, wikitext_split_cjk


 def test_wikitext_split():

@@ -150,6 +150,13 @@ def test_wikitext_split():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_arabic():
     input = "يرجع الأمويون في نسبهم إلى أميَّة بن عبد شمس من قبيلة قريش."

@@ -185,6 +192,13 @@ def test_arabic():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_hebrew():
     input = 'דגל קנדה הידוע בכינויו "דגל עלה האדר" (או המייפל) אומץ בשנת ' + \

@@ -242,6 +256,13 @@ def test_hebrew():
         eq_(token, s)
         eq_(token.type, t)

+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
+

 def test_hindi():
     input = 'वसा अर्थात चिकनाई शरीर को क्रियाशील बनाए रखने मे सहयोग करती है।'

@@ -277,3 +298,10 @@ def test_hindi():
         print(repr(token), (s, t))
         eq_(token, s)
         eq_(token.type, t)
+
+    tokens = list(wikitext_split_cjk.tokenize(input))
+
+    for token, (s, t) in zip(tokens, expected):
+        print(repr(token), (s, t))
+        eq_(token, s)
+        eq_(token.type, t)
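
The commit reruns the existing non-CJK fixtures through wikitext_split_cjk but does not add a CJK-specific case. A hedged sketch of what such a test might look like, following the same pattern; the test name, input string, and expected single-character 'cjk' tokens are assumptions, not part of this commit:

    from nose.tools import eq_

    from ..wikitext_split import wikitext_split_cjk


    def test_cjk():
        # Hypothetical fixture: each CJK character is expected to come back as
        # its own token of type 'cjk', since the cjk character class carries
        # no quantifier.
        input = "日本語"
        expected = [('日', 'cjk'), ('本', 'cjk'), ('語', 'cjk')]

        tokens = list(wikitext_split_cjk.tokenize(input))

        for token, (s, t) in zip(tokens, expected):
            print(repr(token), (s, t))
            eq_(token, s)
            eq_(token.type, t)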

deltas/tokenizers/wikitext_split.py

Lines changed: 15 additions & 34 deletions
@@ -15,44 +15,20 @@
 )
 # re.compile(url, re.U).match("https://website.gov?param=value")

-# Matches Chinese, Japanese and Korean characters.
-cjk = (
-    r'[' +
-    r'\u4E00-\u62FF' +  # noqa Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F' +  # Compatibility Ideographs Suppl.
-    r'\u3041-\u3096' +  # Hiragana
-    r'\u30A0-\u30FF' +  # Katakana
-    r'\u3400-\u4DB5' +  # Kanji
-    r'\u4E00-\u9FCB' +
-    r'\uF900-\uFA6A' +
-    r'\u2E80-\u2FD5' +  # Kanji radicals
-    r'\uFF5F-\uFF9F' +  # Katakana and Punctuation (Half Width)
-    r'\u31F0-\u31FF' +  # Miscellaneous Japanese Symbols and Characters
-    r'\u3220-\u3243' +
-    r'\u3280-\u337F'
-    r']'
-)
-
 devangari_word = r'\u0901-\u0963'
 arabic_word = r'\u0601-\u061A' + \
               r'\u061C-\u0669' + \
               r'\u06D5-\u06EF'
 bengali_word = r'\u0980-\u09FF'
-combined_word = devangari_word + arabic_word + bengali_word
+korean_word = r'\uac00-\ud7a3'
+
+combined_word = devangari_word + arabic_word + bengali_word + korean_word

-word = r'(?:[^\W\d]|[' + combined_word + r'])' + \
+cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF'
+
+cjk = r'[' + cjk_re + ']'
+
+word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \
        r'[\w' + combined_word + r']*' + \
        r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*'

@@ -71,7 +47,6 @@
     ("bold", r"'''"),
     ("italic", r"''"),
     ('japan_punct', r'[\u3000-\u303F]'),
-    ('cjk', cjk),
     ('word', word),
     ('tab_open', r'\{\|'),
     ('tab_close', r'\|\}'),

@@ -97,4 +72,10 @@
     ("etc", r"."),
 ]

-wikitext_split = RegexTokenizer(LEXICON)
+LEXICON_LATIN = LEXICON.copy()
+LEXICON_LATIN.insert(-2, ('cjk', cjk))
+wikitext_split = RegexTokenizer(LEXICON_LATIN)
+
+LEXICON_CJK = LEXICON.copy()
+LEXICON_CJK.insert(0, ('cjk', cjk))
+wikitext_split_cjk = RegexTokenizer(LEXICON_CJK)
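
Both tokenizers are built from the same base LEXICON and differ only in where the 'cjk' rule is inserted: near the end of the list for wikitext_split, first for wikitext_split_cjk, which changes its precedence relative to the other rules. A rough comparison sketch, assuming lexicon entries are tried in order by the RegexTokenizer; the sample text is hypothetical:

    from deltas.tokenizers import wikitext_split, wikitext_split_cjk

    text = "中文 wiki text"

    # Each CJK character should surface as a separate 'cjk' token, since the
    # cjk character class has no quantifier; Latin words still match 'word'
    # because cjk_re is excluded from the word pattern's first character.
    print([(repr(t), t.type) for t in wikitext_split.tokenize(text)])
    print([(repr(t), t.type) for t in wikitext_split_cjk.tokenize(text)])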
