From 79a929c92fe8f18c954baa2e3263f7e6a027f7e9 Mon Sep 17 00:00:00 2001 From: ava Date: Fri, 17 Jan 2020 23:08:48 +0900 Subject: [PATCH 1/2] Correction for text with punctuation and dash --- text/cleaners.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/text/cleaners.py b/text/cleaners.py index e2e35c1a8..15d015542 100755 --- a/text/cleaners.py +++ b/text/cleaners.py @@ -17,8 +17,10 @@ from .numbers import normalize_numbers -# Regular expression matching whitespace: +# Regular expression matching whitespace punctuation, dash: _whitespace_re = re.compile(r'\s+') +_punctuation_re = re.compile(r'[!@#$%^&*+-?/,.;]') +_dash_re = re.compile(r'[-]') # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ @@ -65,6 +67,14 @@ def convert_to_ascii(text): return unidecode(text) +def replace_dash(text): + return re.sub(_dash_re, '', text) + + +def collapse_punctuation(text): + return re.sub(_punctuation_re, '', text) + + def basic_cleaners(text): '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' text = lowercase(text) @@ -87,4 +97,6 @@ def english_cleaners(text): text = expand_numbers(text) text = expand_abbreviations(text) text = collapse_whitespace(text) + text = collapse_punctuation(text) + text = replace_dash(text) return text From ddf84929f6e2efc631d8d5c9f44abb6a11a6e143 Mon Sep 17 00:00:00 2001 From: ava Date: Fri, 17 Jan 2020 23:24:46 +0900 Subject: [PATCH 2/2] Correction for text with punctuation and dash --- text/cleaners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text/cleaners.py b/text/cleaners.py index 15d015542..7d85f9daf 100755 --- a/text/cleaners.py +++ b/text/cleaners.py @@ -19,7 +19,7 @@ # Regular expression matching whitespace punctuation, dash: _whitespace_re = re.compile(r'\s+') -_punctuation_re = re.compile(r'[!@#$%^&*+-?/,.;]') +_punctuation_re = re.compile(r'[()!@#$%^&*+?/,.;\"\'\[\]]') _dash_re = re.compile(r'[-]') # List of (regular expression, replacement) pairs for abbreviations: