|
15 | 15 | ) |
16 | 16 | # re.compile(url, re.U).match("https://website.gov?param=value") |
17 | 17 |
|
18 | | -# Matches Chinese, Japanese and Korean characters. |
19 | | -cjk = ( |
20 | | - r'[' + |
21 | | - r'\u4E00-\u62FF' + # noqa Unified Ideographs |
22 | | - r'\u6300-\u77FF' + |
23 | | - r'\u7800-\u8CFF' + |
24 | | - r'\u8D00-\u9FCC' + |
25 | | - r'\u3400-\u4DFF' + # Unified Ideographs Ext A |
26 | | - r'\U00020000-\U000215FF' + # Unified Ideographs Ext. B |
27 | | - r'\U00021600-\U000230FF' + |
28 | | - r'\U00023100-\U000245FF' + |
29 | | - r'\U00024600-\U000260FF' + |
30 | | - r'\U00026100-\U000275FF' + |
31 | | - r'\U00027600-\U000290FF' + |
32 | | - r'\U00029100-\U0002A6DF' + |
33 | | - r'\uF900-\uFAFF' + # Compatibility Ideographs |
34 | | - r'\U0002F800-\U0002FA1F' + # Compatibility Ideographs Suppl. |
35 | | - r'\u3041-\u3096' + # Hiragana |
36 | | - r'\u30A0-\u30FF' + # Katakana |
37 | | - r'\u3400-\u4DB5' + # Kanji |
38 | | - r'\u4E00-\u9FCB' + |
39 | | - r'\uF900-\uFA6A' + |
40 | | - r'\u2E80-\u2FD5' + # Kanji radicals |
41 | | - r'\uFF5F-\uFF9F' + # Katakana and Punctuation (Half Width) |
42 | | - r'\u31F0-\u31FF' + # Miscellaneous Japanese Symbols and Characters |
43 | | - r'\u3220-\u3243' + |
44 | | - r'\u3280-\u337F' |
45 | | - r']' |
46 | | -) |
47 | | - |
48 | 18 | devangari_word = r'\u0901-\u0963' |
49 | 19 | arabic_word = r'\u0601-\u061A' + \ |
50 | 20 | r'\u061C-\u0669' + \ |
51 | 21 | r'\u06D5-\u06EF' |
52 | 22 | bengali_word = r'\u0980-\u09FF' |
53 | | -combined_word = devangari_word + arabic_word + bengali_word |
| 23 | +korean_word = r'\uac00-\ud7a3' |
| 24 | + |
| 25 | +combined_word = devangari_word + arabic_word + bengali_word + korean_word |
54 | 26 |
|
55 | | -word = r'(?:[^\W\d]|[' + combined_word + r'])' + \ |
| 27 | +cjk_re = r'\u3040-\u30ff' + r'\u4e00-\u9FFF' |
| 28 | + |
| 29 | +cjk = r'[' + cjk_re + ']' |
| 30 | + |
| 31 | +word = r'(?:[^\W\d' + cjk_re + r']|[' + combined_word + r'])' + \ |
56 | 32 | r'[\w' + combined_word + r']*' + \ |
57 | 33 | r'(?:[\'’](?:[\w' + combined_word + r']+|(?=(?:$|\s))))*' |
58 | 34 |
|
|
71 | 47 | ("bold", r"'''"), |
72 | 48 | ("italic", r"''"), |
73 | 49 | ('japan_punct', r'[\u3000-\u303F]'), |
74 | | - ('cjk', cjk), |
75 | 50 | ('word', word), |
76 | 51 | ('tab_open', r'\{\|'), |
77 | 52 | ('tab_close', r'\|\}'), |
|
97 | 72 | ("etc", r"."), |
98 | 73 | ] |
99 | 74 |
|
100 | | -wikitext_split = RegexTokenizer(LEXICON) |
| 75 | +LEXICON_LATIN = LEXICON.copy() |
| 76 | +LEXICON_LATIN.insert(-2, ('cjk', cjk)) |
| 77 | +wikitext_split = RegexTokenizer(LEXICON_LATIN) |
| 78 | + |
| 79 | +LEXICON_CJK = LEXICON.copy() |
| 80 | +LEXICON_CJK.insert(0, ('cjk', cjk)) |
| 81 | +wikitext_split_cjk = RegexTokenizer(LEXICON_CJK) |
0 commit comments