From 52604ac27430aafdab1891bb7942270531a96844 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 12 Dec 2020 07:48:02 +0700 Subject: [PATCH 1/9] Initial rule for Thai sentences --- src/rules/th.toml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 src/rules/th.toml diff --git a/src/rules/th.toml b/src/rules/th.toml new file mode 100644 index 00000000..58b777bd --- /dev/null +++ b/src/rules/th.toml @@ -0,0 +1,36 @@ +min_trimmed_length = 3 +min_word_count = 1 +max_word_count = 14 +min_characters = 0 +may_end_with_colon = false +quote_start_with_letter = true +needs_punctuation_end = false +needs_letter_start = true +needs_uppercase_start = false +disallowed_symbols = [ + '«', '»', '”', 'º', '&', + '<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/', + 'ä', 'ö', 'ü', 'ß','ð','ð', + 'À','Á','Â','Ã','Ä','Å','Æ','Ç','È','É','Ê','Ë','Ì','Í','İ','Î','Ï','Ð','Ñ','Ò','Ó','Ô','Õ','Ö','Ø','Ù','Ú','Û','Û','Ü','Ý','Ž', + 'à','á','â','ã','å','æ','ç','è','é','ê','ë','ì','í','î','ï','ð','ñ','ò','ó','ô','õ', + 'ø','ù','ú','û','ý','þ','ÿ', + 'ā','ă','ą','ć','ċ','č','ď','đ','ē','ĕ','ė','ę','ě','ğ','ġ','ģ','ħ','ĩ','ī','ĭ','į','ı','ķ','ĸ','ĺ','ļ', + 'ľ','ŀ','ł','ń','ņ','ņ','ṫ','š','Ў', + 'ḃ','ḋ','ḟ','ṁ','ṗ','ṡ','ẁ','ẃ','ẅ','ẛ','ỳ', + 'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ', + 'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ', + 'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω', + 'っ','ł', '≤','蘇','ा','≥','ि','җ','پ','국','ी','ň','ก','ō','े', + 'ŏ','×','ő','œ','銭','्','旗','а','瓊','б','名','ҫ','в','ř','飛','어','г','д','ǔ', + 'ś','厘','е','♆','ж','建','†','з','●','∅','и','ş','•','|','福','й','海', + 'к','∈','л','⊂','м','н','…','→','日','ť','о','п','ə','พ','ℝ','р','៛','с','龍','ū','т', + 'у','ㇺ','ф','ů','₫','わ','ч','ű','ш','щ','ک','ъ','ы','ь','脂','龙','凍','ю','∞','ӈ', + 'я','ط','ط','来','゚','ა','₷','ż','ع','脈','є','ž','=','ă','დ','吴','і','ე','ї', + 
'ვ','∩','თ','♭','粉','ი','∪','კ','玉','赤','ლ','�','č','პ','ә','孙','曽','რ','ḥ','3','ن','ه','卍', + 'و','უ','א','ي','ב','ქ','明','⟨','ה','ו','백','ט','发','י','ی','क','한','כ','ө','有','§', + 'ե','温','ª','키','_','白','ș','°','ड','ț','ק','ר','ツ','ת','द','口','石','泉','·','न','紫', + 'प','山','ề','藻','य','վ','す','騨','ệ','木','व','记','天','青','{','ह','|','安','위','}','竜','~', + 'ฯ', 'ๆ', '๏', '๚', '๛', +] +broken_whitespace = [" ", " ,", " .", " ;"] +abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\.)+"] From 1cb7a927c83f48dfdc0fc991d5602ce50c33422c Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 12 Dec 2020 07:57:53 +0700 Subject: [PATCH 2/9] Update th.toml --- src/rules/th.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index 58b777bd..6ddbf946 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -33,4 +33,4 @@ disallowed_symbols = [ 'ฯ', 'ๆ', '๏', '๚', '๛', ] broken_whitespace = [" ", " ,", " .", " ;"] -abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\.)+"] +abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\\.)+"] From 736d4aa36084840d783a659e68976cf1f16f8bed Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 15 Apr 2021 10:20:02 +0700 Subject: [PATCH 3/9] Add/adjust more rules for Thai --- src/rules/th.toml | 156 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 27 deletions(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index 6ddbf946..005e3545 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -1,36 +1,138 @@ +# "other_patterns" borrowing these patterns: +# BEGIN_REGEX, END_REGEX, STRUCTURE_REGEX, and ABBREVIATION_REGEX +# with few adjustments, from: +# https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js +# +# "replacements" borrowing normalization/cleanup patterns, +# with some adjustments, from: +# 
https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/languages/th.js + min_trimmed_length = 3 +# Currently, without Thai word tokenization, +# this "word" will work effectively for Thai +# as "a group of character whitespace/punctuation". min_word_count = 1 -max_word_count = 14 -min_characters = 0 +max_word_count = 5 +min_characters = 6 may_end_with_colon = false quote_start_with_letter = true needs_punctuation_end = false needs_letter_start = true needs_uppercase_start = false -disallowed_symbols = [ - '«', '»', '”', 'º', '&', - '<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/', - 'ä', 'ö', 'ü', 'ß','ð','ð', - 'À','Á','Â','Ã','Ä','Å','Æ','Ç','È','É','Ê','Ë','Ì','Í','İ','Î','Ï','Ð','Ñ','Ò','Ó','Ô','Õ','Ö','Ø','Ù','Ú','Û','Û','Ü','Ý','Ž', - 'à','á','â','ã','å','æ','ç','è','é','ê','ë','ì','í','î','ï','ð','ñ','ò','ó','ô','õ', - 'ø','ù','ú','û','ý','þ','ÿ', - 'ā','ă','ą','ć','ċ','č','ď','đ','ē','ĕ','ė','ę','ě','ğ','ġ','ģ','ħ','ĩ','ī','ĭ','į','ı','ķ','ĸ','ĺ','ļ', - 'ľ','ŀ','ł','ń','ņ','ņ','ṫ','š','Ў', - 'ḃ','ḋ','ḟ','ṁ','ṗ','ṡ','ẁ','ẃ','ẅ','ẛ','ỳ', - 'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ', - 'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ', - 'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω', - 'っ','ł', '≤','蘇','ा','≥','ि','җ','پ','국','ी','ň','ก','ō','े', - 'ŏ','×','ő','œ','銭','्','旗','а','瓊','б','名','ҫ','в','ř','飛','어','г','д','ǔ', - 'ś','厘','е','♆','ж','建','†','з','●','∅','и','ş','•','|','福','й','海', - 'к','∈','л','⊂','м','н','…','→','日','ť','о','п','ə','พ','ℝ','р','៛','с','龍','ū','т', - 'у','ㇺ','ф','ů','₫','わ','ч','ű','ш','щ','ک','ъ','ы','ь','脂','龙','凍','ю','∞','ӈ', - 'я','ط','ط','来','゚','ა','₷','ż','ع','脈','є','ž','=','ă','დ','吴','і','ე','ї', - 'ვ','∩','თ','♭','粉','ი','∪','კ','玉','赤','ლ','�','č','პ','ә','孙','曽','რ','ḥ','3','ن','ه','卍', - 'و','უ','א','ي','ב','ქ','明','⟨','ה','ו','백','ט','发','י','ی','क','한','כ','ө','有','§', - 
'ե','温','ª','키','_','白','ș','°','ड','ț','ק','ר','ツ','ת','द','口','石','泉','·','न','紫', - 'प','山','ề','藻','य','վ','す','騨','ệ','木','व','记','天','青','{','ह','|','安','위','}','竜','~', - 'ฯ', 'ๆ', '๏', '๚', '๛', +allowed_symbols_regex = "[0-9\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c‘’‚;:“”\u0020\u0022\u0027\u0060\\-\\.]" +even_symbols = [ + "\u0022", # Quotation mark + "\u0027" # Apostrophe +] +matching_symbols = [ + ["‘", "’"], + ["“", "”"] ] broken_whitespace = [" ", " ,", " .", " ;"] -abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\\.)+"] + +abbreviation_patterns = [ + "[A-Z]{2,}", + "[A-Z]+\\.*[A-Z]+", + "[ก-ฮ]{1,3}\\.([ก-ฮ]{1,3}\\.)+" +] + +# Other patterns +# - No sentences end with period, comma, colon, semicolon, dash +# - No characters immediately after comma, colon, semicolon +# - Missing whitespace after comma, semicolon, etc. +# +# MAX_LENGTH: +# - Sentence longer than 80 characters +# +# BEGIN_REGEX: +# These Thai chars cannot start the word: +# - All vowels except lead vowels +# - Tone marks +# - Phinthu, Thanthakhat, Nikhahit, Yamakkan +# +# END_REGEX: +# These Thai chars cannot end the word: +# - Lead vowels +# +# STRUCTURE_REGEX: +# - 50 running characters or more without space (difficult to read) +# - Very short word following by a space at the beginning of sentence (maybe orphan) +# +# These classes of Thai characters are not allowed to be immediately repeated: +# - Lead vowels: \u0e40\u0e41\u0e42\u0e43\u0e44 +# - Follow vowels: \u0e30\u0e32\u0e33\u0e45 +# - Above vowels: \u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47 +# - Below vowels: \u0e38\u0e39 +# - Tone marks: \u0e48\u0e49\u0e4a\u0e4b +# - Phinthu: \u0e3a +# - Thanthakhat: \u0e4c +# - Nikhahit: \u0e4d +# - Yamakkan: \u0e4e +# +# These classes of Thai characters have a specific legitimate order: +# - Tone marks/Pinthu/Thanthakat/Nikhahit/Yamakkan can't immediately come after lead and follow vowels +# - Tone marks/Pinthu/Thanthakat/Nikhahit/Yamakkan can't 
immediately come before above and below vowels +other_patterns = [ + "[\\.,:;-]$", + "[,:;]\\S", + "[\\.|\\?|!].+$", + "^.{81,}$", # MAX_LENGTH + "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", # BEGIN_REGEX + "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # END_REGEX + "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}", + "^[^ณ]\\s", + "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|และ|หรือ|กับ|ก็]\\s", # orphan + "^\\S{2,3}\\s", + "\\s\\S{1,3}$", + "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}", + "[\u0e30\u0e32\u0e33\u0e45]{2,}", + "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}", + "[\u0e38\u0e39]{2,}", + "[\u0e48\u0e49\u0e4a\u0e4b]{2,}", + "\u0e3A{2,}", + "\u0e4C{2,}", + "\u0e4D{2,}", + "\u0e4E{2,}", + "[\u0e40\u0e41\u0e42\u0e43\u0e44\u0e30\u0e32\u0e33\u0e45][\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", + "[\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e][\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39]", +] + +# Replacements +# - Remove zero-width characters +# - Expand abbreviations +# - Remove punctuations +# - Normalize character orders +replacements = [ + ["\u200b", ""], # remove zero-width space + ["\u200c", ""], # remove zero-width non-joiner + [" พ.ร.บ.", " พระราชบัญญัติ"], + [" พ.ร.ก.", " พระราชกำหนด"], + [" พ.ศ. ", " พุทธศักราช "], + [" ค.ศ. ", " คริสต์ศักราช "], + [" ม.ร.ว.", " หม่อมราชวงศ์"], + [" .", "."], + [" ,", " "], + [" :", ":"], + [" ;", ";"], + [" !", "!"], + [" ?", "?"], + [":", ": "], + ["?", "? "], + ["!", "! 
"], + [",", " "], + ["..", " "], + ["...", " "], + ["....", " "], + [" .", "."], + [" ", " "], + [" ", " "], + [" ", " "], + ["\u0e40\u0e40", "\u0e41"], # Sara E + Sara E -> Sara Ae + ["\u0e4d\u0e32", "\u0e33"], # Nikhahit + Sara Aa -> Sara Am + ["\u0e4d\u0e48\u0e32", "\u0e48\u0e33"], # Nikhahit + Mai Ek + Sara Aa -> Sara Am + ["\u0e4d\u0e49\u0e32", "\u0e49\u0e33"], # Nikhahit + Mai Tho + Sara Aa -> Sara Am + ["\u0e4d\u0e4A\u0e32", "\u0e4A\u0e33"], # Nikhahit + Mai Tri + Sara Aa -> Sara Am + ["\u0e4d\u0e4B\u0e32", "\u0e4B\u0e33"], # Nikhahit + Mai Chattawa + Sara Aa -> Sara Am + ["\u0e24\u0e32", "\u0e24\u0e45"], # Ru + Sara Aa -> Ru + Lakkhangyao + ["\u0e26\u0e32", "\u0e26\u0e45"], # Lu + Sara Aa -> Lu + Lakkhangyao +] From decd939fe5d31fa5b000183142c7288a7abfa685 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 15 Apr 2021 10:24:32 +0700 Subject: [PATCH 4/9] Add more orphan word rules --- src/rules/th.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index 005e3545..bb7b7cea 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -57,6 +57,7 @@ abbreviation_patterns = [ # STRUCTURE_REGEX: # - 50 running characters or more without space (difficult to read) # - Very short word following by a space at the beginning of sentence (maybe orphan) +# - Avoid orphan words at the beginning and end of sentence # # These classes of Thai characters are not allowed to be immediately repeated: # - Lead vowels: \u0e40\u0e41\u0e42\u0e43\u0e44 @@ -81,9 +82,10 @@ other_patterns = [ "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # END_REGEX "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}", "^[^ณ]\\s", - "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|และ|หรือ|กับ|ก็]\\s", # orphan + "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan "^\\S{2,3}\\s", "\\s\\S{1,3}$", + "\\s[และ|หรือ|กับ|เช่น]$", # orphan "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}", 
"[\u0e30\u0e32\u0e33\u0e45]{2,}", "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}", From 6b9030b6100870e20241103fb2927e46eb171fd7 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 16 Apr 2021 12:49:24 +0700 Subject: [PATCH 5/9] Deal with zero-width spaces --- src/rules/th.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index bb7b7cea..25c18181 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -81,11 +81,11 @@ other_patterns = [ "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", # BEGIN_REGEX "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # END_REGEX "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}", - "^[^ณ]\\s", - "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan - "^\\S{2,3}\\s", - "\\s\\S{1,3}$", - "\\s[และ|หรือ|กับ|เช่น]$", # orphan + "^[\u200b\u200c]*[^ณ]\\s", + "^[\u200b\u200c]*[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan + "^\\S{2,3}[\u200b\u200c]*\\s", + "\\s\\S{1,3}[\u200b\u200c]*$", + "\\s[และ|หรือ|กับ|เช่น][\u200b\u200c]*$", # orphan "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}", "[\u0e30\u0e32\u0e33\u0e45]{2,}", "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}", From 4e050352ad3d79de1d6a2e202095cb135fe7b7a5 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 18 Apr 2021 09:54:54 +0700 Subject: [PATCH 6/9] Allow question mark and exclamation mark --- src/rules/th.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index 25c18181..ad7376cd 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -19,7 +19,7 @@ quote_start_with_letter = true needs_punctuation_end = false needs_letter_start = true needs_uppercase_start = false -allowed_symbols_regex = 
"[0-9\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c‘’‚;:“”\u0020\u0022\u0027\u0060\\-\\.]" +allowed_symbols_regex = "[0-9\u0020\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]" even_symbols = [ "\u0022", # Quotation mark "\u0027" # Apostrophe From 37e113db8660ff26666748d37e68b726174ca826 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 25 Apr 2021 17:17:44 +0700 Subject: [PATCH 7/9] - Remove U+2063 (invisible separator) which occurs in Thai text cut & pasted from some text editors (like MS Word and iOS Notes) - Simplify rules to reflect the fact that `replacements` will run before other rules --- src/rules/th.toml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index ad7376cd..f9bd3b07 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -19,7 +19,7 @@ quote_start_with_letter = true needs_punctuation_end = false needs_letter_start = true needs_uppercase_start = false -allowed_symbols_regex = "[0-9\u0020\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]" +allowed_symbols_regex = "[0-9\u0020\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]" even_symbols = [ "\u0022", # Quotation mark "\u0027" # Apostrophe @@ -81,11 +81,11 @@ other_patterns = [ "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", # BEGIN_REGEX "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # END_REGEX "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}", - "^[\u200b\u200c]*[^ณ]\\s", - "^[\u200b\u200c]*[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan - "^\\S{2,3}[\u200b\u200c]*\\s", - "\\s\\S{1,3}[\u200b\u200c]*$", - "\\s[และ|หรือ|กับ|เช่น][\u200b\u200c]*$", # orphan + "^[^ณ]\\s", + 
"^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan starting + "^\\S{2,3}\\s", # orphan starting + "\\s\\S{1,3}$", # orphan ending + "\\s[และ|หรือ|กับ|เช่น]$", # orphan ending "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}", "[\u0e30\u0e32\u0e33\u0e45]{2,}", "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}", @@ -107,6 +107,7 @@ other_patterns = [ replacements = [ ["\u200b", ""], # remove zero-width space ["\u200c", ""], # remove zero-width non-joiner + ["\u2063", ""], # remove invisible separator [" พ.ร.บ.", " พระราชบัญญัติ"], [" พ.ร.ก.", " พระราชกำหนด"], [" พ.ศ. ", " พุทธศักราช "], From 60956ec0fd6356ec5c813ac0b1be24146a686d2a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 8 Jun 2021 08:41:39 +0100 Subject: [PATCH 8/9] Adjust other_patterns and replacements --- src/rules/th.toml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index f9bd3b07..f78522f7 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -22,7 +22,6 @@ needs_uppercase_start = false allowed_symbols_regex = "[0-9\u0020\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]" even_symbols = [ "\u0022", # Quotation mark - "\u0027" # Apostrophe ] matching_symbols = [ ["‘", "’"], @@ -77,12 +76,14 @@ other_patterns = [ "[\\.,:;-]$", "[,:;]\\S", "[\\.|\\?|!].+$", - "^.{81,}$", # MAX_LENGTH - "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", # BEGIN_REGEX - "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # END_REGEX - "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}", - "^[^ณ]\\s", - "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # orphan starting + "^.{101,}$", # MAX_LENGTH = 100 + 
"(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]", # invalid char to start word/sentence + "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)", # invalid char to end word/sentence + "[\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{81,}", # more than 80 chars running without a space + "[\u0e01-\u0e4e]{71,}", # more than 70 Thai consonants and vowels running without a space + "[ก-ฮ]{31,}", # more than 30 consonants running without a space + "^[^ณ]\\s", # lone character at the beginning of a sentence (except No Nen) + "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s", # words that indicate orphan starting "^\\S{2,3}\\s", # orphan starting "\\s\\S{1,3}$", # orphan ending "\\s[และ|หรือ|กับ|เช่น]$", # orphan ending @@ -120,8 +121,6 @@ replacements = [ [" !", "!"], [" ?", "?"], [":", ": "], - ["?", "? "], - ["!", "! "], [",", " "], ["..", " "], ["...", " "], From 9e5ec796ae04ba4c9a00d2f323096283c4a85fb3 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 9 Jun 2021 02:08:47 +0100 Subject: [PATCH 9/9] deal with double vowels --- src/rules/th.toml | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/rules/th.toml b/src/rules/th.toml index f78522f7..68494c43 100644 --- a/src/rules/th.toml +++ b/src/rules/th.toml @@ -109,11 +109,36 @@ replacements = [ ["\u200b", ""], # remove zero-width space ["\u200c", ""], # remove zero-width non-joiner ["\u2063", ""], # remove invisible separator - [" พ.ร.บ.", " พระราชบัญญัติ"], - [" พ.ร.ก.", " พระราชกำหนด"], - [" พ.ศ. ", " พุทธศักราช "], - [" ค.ศ. 
", " คริสต์ศักราช "], - [" ม.ร.ว.", " หม่อมราชวงศ์"], + ["\u0e30\u0e30", "\u0e30"], # double vowel + ["\u0e31\u0e31", "\u0e31"], # double vowel + ["\u0e32\u0e32", "\u0e32"], # double vowel + ["\u0e33\u0e33", "\u0e33"], # double vowel + ["\u0e34\u0e34", "\u0e34"], # double vowel + ["\u0e35\u0e35", "\u0e35"], # double vowel + ["\u0e36\u0e36", "\u0e36"], # double vowel + ["\u0e37\u0e37", "\u0e37"], # double vowel + ["\u0e38\u0e38", "\u0e38"], # double vowel + ["\u0e39\u0e39", "\u0e39"], # double vowel + ["\u0e3a\u0e3a", "\u0e3a"], # double Phinthu + ["\u0e41\u0e41", "\u0e41"], # double vowel + ["\u0e42\u0e42", "\u0e42"], # double vowel + ["\u0e43\u0e43", "\u0e43"], # double vowel + ["\u0e44\u0e44", "\u0e44"], # double vowel + ["\u0e45\u0e45", "\u0e45"], # double vowel + ["\u0e46\u0e46", "\u0e46"], # double Maiyamok + ["\u0e47\u0e47", "\u0e47"], # double vowel + ["\u0e48\u0e48", "\u0e48"], # double tone mark + ["\u0e49\u0e49", "\u0e49"], # double tone mark + ["\u0e4a\u0e4a", "\u0e4a"], # double tone mark + ["\u0e4b\u0e4b", "\u0e4b"], # double tone mark + ["\u0e4c\u0e4c", "\u0e4c"], # double Thanthakhat + ["\u0e4d\u0e4d", "\u0e4d"], # double Nikhahit + ["\u0e4e\u0e4e", "\u0e4e"], # double Yamakkan + [" พ.ร.บ.", " พระราชบัญญัติ"], # abbreviation + [" พ.ร.ก.", " พระราชกำหนด"], # abbreviation + [" พ.ศ. ", " พุทธศักราช "], # abbreviation + [" ค.ศ. ", " คริสต์ศักราช "], # abbreviation + [" ม.ร.ว.", " หม่อมราชวงศ์"], # abbreviation [" .", "."], [" ,", " "], [" :", ":"],