From 52604ac27430aafdab1891bb7942270531a96844 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 12 Dec 2020 07:48:02 +0700
Subject: [PATCH 1/9] Initial rule for Thai sentences

---
 src/rules/th.toml | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 src/rules/th.toml

diff --git a/src/rules/th.toml b/src/rules/th.toml
new file mode 100644
index 00000000..58b777bd
--- /dev/null
+++ b/src/rules/th.toml
@@ -0,0 +1,36 @@
+min_trimmed_length = 3
+min_word_count = 1
+max_word_count = 14
+min_characters = 0
+may_end_with_colon = false
+quote_start_with_letter = true
+needs_punctuation_end = false
+needs_letter_start = true
+needs_uppercase_start = false
+disallowed_symbols = [
+  '«', '»', '”', 'º', '&',
+  '<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/',
+  'ä', 'ö', 'ü', 'ß','ð','ð',
+  'À','Á','Â','Ã','Ä','Å','Æ','Ç','È','É','Ê','Ë','Ì','Í','İ','Î','Ï','Ð','Ñ','Ò','Ó','Ô','Õ','Ö','Ø','Ù','Ú','Û','Û','Ü','Ý','Ž',
+  'à','á','â','ã','å','æ','ç','è','é','ê','ë','ì','í','î','ï','ð','ñ','ò','ó','ô','õ',
+  'ø','ù','ú','û','ý','þ','ÿ',
+  'ā','ă','ą','ć','ċ','č','ď','đ','ē','ĕ','ė','ę','ě','ğ','ġ','ģ','ħ','ĩ','ī','ĭ','į','ı','ķ','ĸ','ĺ','ļ',
+  'ľ','ŀ','ł','ń','ņ','ņ','ṫ','š','Ў',
+  'ḃ','ḋ','ḟ','ṁ','ṗ','ṡ','ẁ','ẃ','ẅ','ẛ','ỳ',
+  'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ',
+  'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ',
+  'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω',
+  'っ','ł', '≤','蘇','ा','≥','ि','җ','پ','국','ी','ň','ก','ō','े',
+  'ŏ','×','ő','œ','銭','्','旗','а','瓊','б','名','ҫ','в','ř','飛','어','г','д','ǔ',
+  'ś','厘','е','♆','ж','建','†','з','●','∅','и','ş','•','｜','福','й','海',
+  'к','∈','л','⊂','м','н','…','→','日','ť','о','п','ə','พ','ℝ','р','៛','с','龍','ū','т',
+  'у','ㇺ','ф','ů','₫','わ','ч','ű','ш','щ','ک','ъ','ы','ь','脂','龙','凍','ю','∞','ӈ',
+  'я','ط','ط','来','゚','ა','₷','ż','ع','脈','є','ž','=','ă','დ','吴','і','ე','ї',
+  'ვ','∩','თ','♭','粉','ი','∪','კ','玉','赤','ლ','�','č','პ','ә','孙','曽','რ','ḥ','3','ن','ه','卍',
+  'و','უ','א','ي','ב','ქ','明','⟨','ה','ו','백','ט','发','י','ی','क','한','כ','ө','有','§',
+  'ե','温','ª','키','_','白','ș','°','ड','ț','ק','ר','ツ','ת','द','口','石','泉','·','न','紫',
+  'प','山','ề','藻','य','վ','す','騨','ệ','木','व','记','天','青','{','ह','|','安','위','}','竜','~',
+  'ฯ', 'ๆ', '๏', '๚', '๛',
+]
+broken_whitespace = ["  ", " ,", " .", " ;"]
+abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\.)+"]

From 1cb7a927c83f48dfdc0fc991d5602ce50c33422c Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sat, 12 Dec 2020 07:57:53 +0700
Subject: [PATCH 2/9] Update th.toml

---
 src/rules/th.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index 58b777bd..6ddbf946 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -33,4 +33,4 @@ disallowed_symbols = [
   'ฯ', 'ๆ', '๏', '๚', '๛',
 ]
 broken_whitespace = ["  ", " ,", " .", " ;"]
-abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\.)+"]
+abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\\.)+"]

From 736d4aa36084840d783a659e68976cf1f16f8bed Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 15 Apr 2021 10:20:02 +0700
Subject: [PATCH 3/9] Add/adjust more rules for Thai

---
 src/rules/th.toml | 156 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 129 insertions(+), 27 deletions(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index 6ddbf946..005e3545 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -1,36 +1,138 @@
+# "other_patterns" borrowing these patterns:
+# BEGIN_REGEX, END_REGEX, STRUCTURE_REGEX, and ABBREVIATION_REGEX,
+# with a few adjustments, from:
+# https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js
+#
+# "replacements" borrowing normalization/cleanup patterns,
+# with some adjustments, from:
+# https://github.com/common-voice/sentence-collector/blob/main/server/lib/cleanup/languages/th.js
+
 min_trimmed_length = 3
+# Currently, without Thai word tokenization,
+# a "word" here effectively means, for Thai,
+# "a group of characters delimited by whitespace/punctuation".
 min_word_count = 1
-max_word_count = 14
-min_characters = 0
+max_word_count = 5
+min_characters = 6
 may_end_with_colon = false
 quote_start_with_letter = true
 needs_punctuation_end = false
 needs_letter_start = true
 needs_uppercase_start = false
-disallowed_symbols = [
-  '«', '»', '”', 'º', '&',
-  '<', '>', '+', '*', '\', '#', '@', '^', '[', ']', '(', ')', '/',
-  'ä', 'ö', 'ü', 'ß','ð','ð',
-  'À','Á','Â','Ã','Ä','Å','Æ','Ç','È','É','Ê','Ë','Ì','Í','İ','Î','Ï','Ð','Ñ','Ò','Ó','Ô','Õ','Ö','Ø','Ù','Ú','Û','Û','Ü','Ý','Ž',
-  'à','á','â','ã','å','æ','ç','è','é','ê','ë','ì','í','î','ï','ð','ñ','ò','ó','ô','õ',
-  'ø','ù','ú','û','ý','þ','ÿ',
-  'ā','ă','ą','ć','ċ','č','ď','đ','ē','ĕ','ė','ę','ě','ğ','ġ','ģ','ħ','ĩ','ī','ĭ','į','ı','ķ','ĸ','ĺ','ļ',
-  'ľ','ŀ','ł','ń','ņ','ņ','ṫ','š','Ў',
-  'ḃ','ḋ','ḟ','ṁ','ṗ','ṡ','ẁ','ẃ','ẅ','ẛ','ỳ',
-  'α', 'β', 'Γ', 'γ', 'Δ', 'δ', 'ε', 'ζ', 'η', 'Θ', 'θ', 'ι', 'κ',
-  'Λ', 'λ', 'μ', 'ν', 'Ξ', 'ξ', 'Π', 'π', 'ρ', 'Σ', 'σ', 'ς', 'τ',
-  'υ', 'Φ', 'φ', 'χ', 'Ψ', 'ψ', 'Ω', 'ω',
-  'っ','ł', '≤','蘇','ा','≥','ि','җ','پ','국','ी','ň','ก','ō','े',
-  'ŏ','×','ő','œ','銭','्','旗','а','瓊','б','名','ҫ','в','ř','飛','어','г','д','ǔ',
-  'ś','厘','е','♆','ж','建','†','з','●','∅','и','ş','•','｜','福','й','海',
-  'к','∈','л','⊂','м','н','…','→','日','ť','о','п','ə','พ','ℝ','р','៛','с','龍','ū','т',
-  'у','ㇺ','ф','ů','₫','わ','ч','ű','ш','щ','ک','ъ','ы','ь','脂','龙','凍','ю','∞','ӈ',
-  'я','ط','ط','来','゚','ა','₷','ż','ع','脈','є','ž','=','ă','დ','吴','і','ე','ї',
-  'ვ','∩','თ','♭','粉','ი','∪','კ','玉','赤','ლ','�','č','პ','ә','孙','曽','რ','ḥ','3','ن','ه','卍',
-  'و','უ','א','ي','ב','ქ','明','⟨','ה','ו','백','ט','发','י','ی','क','한','כ','ө','有','§',
-  'ե','温','ª','키','_','白','ș','°','ड','ț','ק','ר','ツ','ת','द','口','石','泉','·','न','紫',
-  'प','山','ề','藻','य','վ','す','騨','ệ','木','व','记','天','青','{','ह','|','安','위','}','竜','~',
-  'ฯ', 'ๆ', '๏', '๚', '๛',
+allowed_symbols_regex = "[0-9\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c‘’‚;:“”\u0020\u0022\u0027\u0060\\-\\.]"
+even_symbols = [
+  "\u0022",  # Quotation mark
+  "\u0027"   # Apostrophe
+]
+matching_symbols = [
+  ["‘", "’"],
+  ["“", "”"]
 ]
 broken_whitespace = ["  ", " ,", " .", " ;"]
-abbreviation_patterns = ["[A-Z]{2,}|[A-Z]+\\.*[A-Z]+|[ก-ฮ]+\\.([ก-ฮ]+\\.)+"]
+
+abbreviation_patterns = [
+  "[A-Z]{2,}",
+  "[A-Z]+\\.*[A-Z]+",
+  "[ก-ฮ]{1,3}\\.([ก-ฮ]{1,3}\\.)+"
+]
+
+# Other patterns
+# - No sentences end with period, comma, colon, semicolon, dash
+# - No characters immediately after comma, colon, semicolon
+# - Missing whitespace after comma, semicolon, etc.
+#
+# MAX_LENGTH:
+# - Sentence longer than 80 characters
+#
+# BEGIN_REGEX:
+# These Thai chars cannot start the word:
+# - All vowels except lead vowels
+# - Tone marks
+# - Phinthu, Thanthakhat, Nikhahit, Yamakkan
+#
+# END_REGEX:
+# These Thai chars cannot end the word:
+# - Lead vowels
+#
+# STRUCTURE_REGEX:
+# - 50 running characters or more without space (difficult to read)
+# - Very short word following by a space at the beginning of sentence (maybe orphan)
+#
+# These classes of Thai characters are not allowed to be immediately repeated:
+# - Lead vowels: \u0e40\u0e41\u0e42\u0e43\u0e44
+# - Follow vowels: \u0e30\u0e32\u0e33\u0e45
+# - Above vowels: \u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47
+# - Below vowels: \u0e38\u0e39
+# - Tone marks: \u0e48\u0e49\u0e4a\u0e4b
+# - Phinthu: \u0e3a
+# - Thanthakhat: \u0e4c
+# - Nikhahit: \u0e4d
+# - Yamakkan: \u0e4e
+#
+# These classes of Thai characters have a specific legitimate order:
+# - Tone marks/Phinthu/Thanthakhat/Nikhahit/Yamakkan can't immediately come after lead and follow vowels
+# - Tone marks/Phinthu/Thanthakhat/Nikhahit/Yamakkan can't immediately come before above and below vowels
+other_patterns = [
+  "[\\.,:;-]$",
+  "[,:;]\\S",
+  "[\\.|\\?|!].+$",
+  "^.{81,}$",  # MAX_LENGTH
+  "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",  # BEGIN_REGEX
+  "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # END_REGEX
+  "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}",
+  "^[^ณ]\\s",
+  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|และ|หรือ|กับ|ก็]\\s",  # orphan
+  "^\\S{2,3}\\s",
+  "\\s\\S{1,3}$",
+  "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}",
+  "[\u0e30\u0e32\u0e33\u0e45]{2,}",
+  "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}",
+  "[\u0e38\u0e39]{2,}",
+  "[\u0e48\u0e49\u0e4a\u0e4b]{2,}",
+  "\u0e3A{2,}",
+  "\u0e4C{2,}",
+  "\u0e4D{2,}",
+  "\u0e4E{2,}",
+  "[\u0e40\u0e41\u0e42\u0e43\u0e44\u0e30\u0e32\u0e33\u0e45][\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",
+  "[\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e][\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39]",
+]
+
+# Replacements
+# - Remove zero-width characters
+# - Expand abbreviations
+# - Remove punctuations
+# - Normalize character orders
+replacements = [
+  ["\u200b", ""],  # remove zero-width space
+  ["\u200c", ""],  # remove zero-width non-joiner
+  [" พ.ร.บ.", " พระราชบัญญัติ"],
+  [" พ.ร.ก.", " พระราชกำหนด"],
+  [" พ.ศ. ", " พุทธศักราช "],
+  [" ค.ศ. ", " คริสต์ศักราช "],
+  [" ม.ร.ว.", " หม่อมราชวงศ์"],
+  [" .", "."],
+  [" ,", " "],
+  [" :", ":"],
+  [" ;", ";"],
+  [" !", "!"],
+  [" ?", "?"],
+  [":", ": "],
+  ["?", "? "],
+  ["!", "! "],  
+  [",", " "],
+  ["..", " "],
+  ["...", " "],
+  ["....", " "],
+  [" .", "."],
+  ["    ", " "],
+  ["   ", " "],
+  ["  ", " "],
+  ["\u0e40\u0e40", "\u0e41"],  # Sara E + Sara E -> Sara Ae
+  ["\u0e4d\u0e32", "\u0e33"],  # Nikhahit + Sara Aa -> Sara Am
+  ["\u0e4d\u0e48\u0e32", "\u0e48\u0e33"],  # Nikhahit + Mai Ek + Sara Aa -> Mai Ek + Sara Am
+  ["\u0e4d\u0e49\u0e32", "\u0e49\u0e33"],  # Nikhahit + Mai Tho + Sara Aa -> Mai Tho + Sara Am
+  ["\u0e4d\u0e4A\u0e32", "\u0e4A\u0e33"],  # Nikhahit + Mai Tri + Sara Aa -> Mai Tri + Sara Am
+  ["\u0e4d\u0e4B\u0e32", "\u0e4B\u0e33"],  # Nikhahit + Mai Chattawa + Sara Aa -> Mai Chattawa + Sara Am
+  ["\u0e24\u0e32", "\u0e24\u0e45"],  # Ru + Sara Aa -> Ru + Lakkhangyao
+  ["\u0e26\u0e32", "\u0e26\u0e45"],  # Lu + Sara Aa -> Lu + Lakkhangyao
+]

From decd939fe5d31fa5b000183142c7288a7abfa685 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 15 Apr 2021 10:24:32 +0700
Subject: [PATCH 4/9] Add more orphan word rules

---
 src/rules/th.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index 005e3545..bb7b7cea 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -57,6 +57,7 @@ abbreviation_patterns = [
 # STRUCTURE_REGEX:
 # - 50 running characters or more without space (difficult to read)
 # - Very short word following by a space at the beginning of sentence (maybe orphan)
+# - Avoid orphan words at the beginning and end of sentence
 #
 # These classes of Thai characters are not allowed to be immediately repeated:
 # - Lead vowels: \u0e40\u0e41\u0e42\u0e43\u0e44
@@ -81,9 +82,10 @@ other_patterns = [
   "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # END_REGEX
   "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}",
   "^[^ณ]\\s",
-  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|และ|หรือ|กับ|ก็]\\s",  # orphan
+  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan
   "^\\S{2,3}\\s",
   "\\s\\S{1,3}$",
+  "\\s[และ|หรือ|กับ|เช่น]$",  # orphan
   "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}",
   "[\u0e30\u0e32\u0e33\u0e45]{2,}",
   "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}",

From 6b9030b6100870e20241103fb2927e46eb171fd7 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Fri, 16 Apr 2021 12:49:24 +0700
Subject: [PATCH 5/9] Deal with zero-width spaces

---
 src/rules/th.toml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index bb7b7cea..25c18181 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -81,11 +81,11 @@ other_patterns = [
   "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",  # BEGIN_REGEX
   "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # END_REGEX
   "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}",
-  "^[^ณ]\\s",
-  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan
-  "^\\S{2,3}\\s",
-  "\\s\\S{1,3}$",
-  "\\s[และ|หรือ|กับ|เช่น]$",  # orphan
+  "^[\u200b\u200c]*[^ณ]\\s",
+  "^[\u200b\u200c]*[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan
+  "^\\S{2,3}[\u200b\u200c]*\\s",
+  "\\s\\S{1,3}[\u200b\u200c]*$",
+  "\\s[และ|หรือ|กับ|เช่น][\u200b\u200c]*$",  # orphan
   "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}",
   "[\u0e30\u0e32\u0e33\u0e45]{2,}",
   "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}",

From 4e050352ad3d79de1d6a2e202095cb135fe7b7a5 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 18 Apr 2021 09:54:54 +0700
Subject: [PATCH 6/9] Allow question mark and exclamation mark

---
 src/rules/th.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index 25c18181..ad7376cd 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -19,7 +19,7 @@ quote_start_with_letter = true
 needs_punctuation_end = false
 needs_letter_start = true
 needs_uppercase_start = false
-allowed_symbols_regex = "[0-9\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c‘’‚;:“”\u0020\u0022\u0027\u0060\\-\\.]"
+allowed_symbols_regex = "[0-9\u0020\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]"
 even_symbols = [
   "\u0022",  # Quotation mark
   "\u0027"   # Apostrophe

From 37e113db8660ff26666748d37e68b726174ca826 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 25 Apr 2021 17:17:44 +0700
Subject: [PATCH 7/9] - Remove U+2063 (invisible separator) which occurs in
 Thai text cut & pasted from some text editors (like MS Word and iOS Notes) -
 Simplify rules to reflect the fact that `replacements` will run before other
 rules

---
 src/rules/th.toml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index ad7376cd..f9bd3b07 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -19,7 +19,7 @@ quote_start_with_letter = true
 needs_punctuation_end = false
 needs_letter_start = true
 needs_uppercase_start = false
-allowed_symbols_regex = "[0-9\u0020\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]"
+allowed_symbols_regex = "[0-9\u0020\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]"
 even_symbols = [
   "\u0022",  # Quotation mark
   "\u0027"   # Apostrophe
@@ -81,11 +81,11 @@ other_patterns = [
   "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",  # BEGIN_REGEX
   "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # END_REGEX
   "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}",
-  "^[\u200b\u200c]*[^ณ]\\s",
-  "^[\u200b\u200c]*[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan
-  "^\\S{2,3}[\u200b\u200c]*\\s",
-  "\\s\\S{1,3}[\u200b\u200c]*$",
-  "\\s[และ|หรือ|กับ|เช่น][\u200b\u200c]*$",  # orphan
+  "^[^ณ]\\s",
+  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan starting
+  "^\\S{2,3}\\s",  # orphan starting
+  "\\s\\S{1,3}$",  # orphan ending
+  "\\s[และ|หรือ|กับ|เช่น]$",  # orphan ending
   "[\u0e40\u0e41\u0e42\u0e43\u0e44]{2,}",
   "[\u0e30\u0e32\u0e33\u0e45]{2,}",
   "[\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47]{2,}",
@@ -107,6 +107,7 @@ other_patterns = [
 replacements = [
   ["\u200b", ""],  # remove zero-width space
   ["\u200c", ""],  # remove zero-width non-joiner
+  ["\u2063", ""],  # remove invisible separator
   [" พ.ร.บ.", " พระราชบัญญัติ"],
   [" พ.ร.ก.", " พระราชกำหนด"],
   [" พ.ศ. ", " พุทธศักราช "],

From 60956ec0fd6356ec5c813ac0b1be24146a686d2a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Tue, 8 Jun 2021 08:41:39 +0100
Subject: [PATCH 8/9] Adjust other_patterns and replacements

---
 src/rules/th.toml | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index f9bd3b07..f78522f7 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -22,7 +22,6 @@ needs_uppercase_start = false
 allowed_symbols_regex = "[0-9\u0020\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e45\u0e47-\u0e4c\\-\\.‚;:!\\?“”‘’\u0022\u0027\u0060]"
 even_symbols = [
   "\u0022",  # Quotation mark
-  "\u0027"   # Apostrophe
 ]
 matching_symbols = [
   ["‘", "’"],
@@ -77,12 +76,14 @@ other_patterns = [
   "[\\.,:;-]$",
   "[,:;]\\S",
   "[\\.|\\?|!].+$",
-  "^.{81,}$",  # MAX_LENGTH
-  "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",  # BEGIN_REGEX
-  "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # END_REGEX
-  "[\u200b\u200c\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{55,}",
-  "^[^ณ]\\s",
-  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # orphan starting
+  "^.{101,}$",  # MAX_LENGTH = 100
+  "(^|\\s+)[\u0e30\u0e32\u0e33\u0e45\u0e31\u0e34\u0e35\u0e36\u0e37\u0e4d\u0e47\u0e38\u0e39\u0e48\u0e49\u0e4a\u0e4b\u0e3a\u0e4c\u0e4d\u0e4e]",  # invalid char to start word/sentence
+  "[\u0e40\u0e41\u0e42\u0e43\u0e44](\\s+|$)",  # invalid char to end word/sentence
+  "[\u0e01-\u0e2e\u0e30-\u0e39\u0e40-\u0e4c‘’‚;:“”\u0022\u0027\u0060\\-\\?\\.!]{81,}",  # more than 80 chars running without a space
+  "[\u0e01-\u0e4e]{71,}",  # more than 70 Thai consonants and vowels running without a space
+  "[ก-ฮ]{31,}",  # more than 30 consonants running without a space
+  "^[^ณ]\\s",  # lone character at the beginning of a sentence (except No Nen)
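+  "^[บ้าง|ก่อน|เลย|แล้ว|หรือไม่|ไหม|ล่ะ|ด้วย|อีก|และ|หรือ|กับ|ก็]\\s",  # words that indicate orphan starting (FIXME: [...] is a character class, so this matches any ONE listed character, not a whole word; use (...) for alternation)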
   "^\\S{2,3}\\s",  # orphan starting
   "\\s\\S{1,3}$",  # orphan ending
   "\\s[และ|หรือ|กับ|เช่น]$",  # orphan ending
@@ -120,8 +121,6 @@ replacements = [
   [" !", "!"],
   [" ?", "?"],
   [":", ": "],
-  ["?", "? "],
-  ["!", "! "],  
   [",", " "],
   ["..", " "],
   ["...", " "],

From 9e5ec796ae04ba4c9a00d2f323096283c4a85fb3 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Wed, 9 Jun 2021 02:08:47 +0100
Subject: [PATCH 9/9] deal with double vowels

---
 src/rules/th.toml | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/src/rules/th.toml b/src/rules/th.toml
index f78522f7..68494c43 100644
--- a/src/rules/th.toml
+++ b/src/rules/th.toml
@@ -109,11 +109,36 @@ replacements = [
   ["\u200b", ""],  # remove zero-width space
   ["\u200c", ""],  # remove zero-width non-joiner
   ["\u2063", ""],  # remove invisible separator
-  [" พ.ร.บ.", " พระราชบัญญัติ"],
-  [" พ.ร.ก.", " พระราชกำหนด"],
-  [" พ.ศ. ", " พุทธศักราช "],
-  [" ค.ศ. ", " คริสต์ศักราช "],
-  [" ม.ร.ว.", " หม่อมราชวงศ์"],
+  ["\u0e30\u0e30", "\u0e30"],  # double vowel
+  ["\u0e31\u0e31", "\u0e31"],  # double vowel
+  ["\u0e32\u0e32", "\u0e32"],  # double vowel
+  ["\u0e33\u0e33", "\u0e33"],  # double vowel
+  ["\u0e34\u0e34", "\u0e34"],  # double vowel
+  ["\u0e35\u0e35", "\u0e35"],  # double vowel
+  ["\u0e36\u0e36", "\u0e36"],  # double vowel
+  ["\u0e37\u0e37", "\u0e37"],  # double vowel
+  ["\u0e38\u0e38", "\u0e38"],  # double vowel
+  ["\u0e39\u0e39", "\u0e39"],  # double vowel
+  ["\u0e3a\u0e3a", "\u0e3a"],  # double Phinthu
+  ["\u0e41\u0e41", "\u0e41"],  # double vowel
+  ["\u0e42\u0e42", "\u0e42"],  # double vowel
+  ["\u0e43\u0e43", "\u0e43"],  # double vowel
+  ["\u0e44\u0e44", "\u0e44"],  # double vowel
+  ["\u0e45\u0e45", "\u0e45"],  # double vowel
+  ["\u0e46\u0e46", "\u0e46"],  # double Maiyamok
+  ["\u0e47\u0e47", "\u0e47"],  # double vowel
+  ["\u0e48\u0e48", "\u0e48"],  # double tone mark
+  ["\u0e49\u0e49", "\u0e49"],  # double tone mark
+  ["\u0e4a\u0e4a", "\u0e4a"],  # double tone mark
+  ["\u0e4b\u0e4b", "\u0e4b"],  # double tone mark
+  ["\u0e4c\u0e4c", "\u0e4c"],  # double symbol
+  ["\u0e4d\u0e4d", "\u0e4d"],  # double symbol
+  ["\u0e4e\u0e4e", "\u0e4e"],  # double symbol
+  [" พ.ร.บ.", " พระราชบัญญัติ"],  # abbreviation
+  [" พ.ร.ก.", " พระราชกำหนด"],  # abbreviation
+  [" พ.ศ. ", " พุทธศักราช "],  # abbreviation
+  [" ค.ศ. ", " คริสต์ศักราช "],  # abbreviation
+  [" ม.ร.ว.", " หม่อมราชวงศ์"],  # abbreviation
   [" .", "."],
   [" ,", " "],
   [" :", ":"],