diff --git a/converters/Range.py b/converters/Range.py
new file mode 100644
index 00000000..9bbf8e02
--- /dev/null
+++ b/converters/Range.py
@@ -0,0 +1,42 @@
+
+from singleton_decorator import singleton
+import re
+from .Cardinal import Cardinal
+
+@singleton
+class Range:
+    """
+    Convert hyphen-separated numeric ranges into spoken text.
+
+    Steps:
+    - Split the token on "-"
+    - Convert every non-empty part with the Cardinal converter
+    - Join exactly two parts with " to ", any other count with spaces
+
+    Note:
+    Punctuation always stays the same
+    """
+    def __init__(self):
+        super().__init__()
+        self.cardinal = Cardinal()
+
+    def convert(self, token: str) -> str:
+        """
+        Return the spoken form of `token`.
+
+        A token made up only of hyphens (or an empty token) is returned
+        unchanged.
+        """
+        # Skip empty pieces so a leading/trailing "-" does not hand "" to
+        # Cardinal or leave a dangling " to ".
+        numbers = [number for number in re.split('-', token) if number]
+        if not numbers:
+            return token
+
+        words = [self.cardinal.convert(number) for number in numbers]
+        if len(words) == 2:
+            return ' to '.join(words)
+
+        # One part, or three and more: joining avoids the trailing space
+        # that appending ' ' after every part would leave behind.
+        return ' '.join(words)
diff --git a/text_normalize.py b/text_normalize.py
index 5e6b15b8..0ca7e9bf 100644
--- a/text_normalize.py
+++ b/text_normalize.py
@@ -5,6 +5,7 @@ import unicodedata
 
 
 import os, sys
+import re
 
 from converters.Plain import Plain
 from converters.Punct import Punct
@@ -23,6 +24,8 @@ from converters.Telephone import Telephone
 from converters.Address import Address
 from converters.Roman import Roman
 
+from converters.Range import Range
+
 
 months = ['jan',
           'feb',
@@ -64,7 +67,8 @@
     "FRACTION": Fraction(),
     "TELEPHONE": Telephone(),
     "ADDRESS": Address(),
-    "ROMAN": Roman()
+    "ROMAN": Roman(),
+    "RANGE": Range()
 }
 
 def split_given_size(a, size):
@@ -108,6 +112,9 @@ def is_fraction(inputString):
 def is_decimal(inputString):
     return "." in inputString
 
+def is_range(inputString):
+    return "-" in inputString
+
 def is_url(inputString):
     return "//" in inputString or ".com" in inputString or ".html" in inputString
 
@@ -119,10 +126,10 @@ def normalize_single(text, prev_text = "", next_text = ""):
         text = labels['ELECTRONIC'].convert(text).upper()
     elif has_numbers(text):
         if has_month(prev_text):
-            prev_text = prev_text.lower()
+            prev_text = labels['DATE'].get_month(prev_text.lower())
             text = labels['DATE'].convert(prev_text + " " + text).replace(prev_text, "").strip()
         elif has_month(next_text):
-            next_text = next_text.lower()
+            next_text = labels['DATE'].get_month(next_text.lower())
             text = labels['DATE'].convert(text + " " + next_text).replace(next_text, "").strip()
         elif is_oridinal(text):
             text = labels['ORDINAL'].convert(text)
@@ -136,6 +143,8 @@ def normalize_single(text, prev_text = "", next_text = ""):
             text = labels['DECIMAL'].convert(text)
         elif is_cardinal(text):
             text = labels['CARDINAL'].convert(text)
+        elif is_range(text):
+            text = labels['RANGE'].convert(text)
         else:
             text = labels['DATE'].convert(text)
 
@@ -149,6 +158,7 @@ def normalize_single(text, prev_text = "", next_text = ""):
 def normalize_text(text):
     text = remove_accents(text).replace('–', ' to ').replace('-', ' - ').replace(":p", ": p").replace(":P", ": P").replace(":d", ": d").replace(":D", ": D")
     words = word_tokenize(text)
+
     df = pd.DataFrame(words, columns=['before'])
     df['after'] = df['before']
 
@@ -157,4 +167,10 @@ def normalize_text(text):
 
     df['after'] = df['previous'].apply(lambda m: normalize_single(m.split('|')[1], m.split('|')[0], m.split('|')[2]))
 
-    return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s")
\ No newline at end of file
+    return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s")
+
+if __name__ == '__main__':
+    text = 'hello (23 Jan 2020, 12:10 AM)'
+    out = normalize_text(text)
+    print(out)
+
diff --git a/train.ipynb b/train.ipynb
index 9cc2f4b4..a7becc3d 100644
--- a/train.ipynb
+++ b/train.ipynb
@@ -117,7 +117,7 @@
     "    \n",
     "    bert = AlbertModel(albert_base_configuration)\n",
     "    bert = MultiTaskModel(bert, \n",
-    "                          num_vocab=max([m['token'] for m in token_maps.values()]), \n",
+    "                          num_vocab=1 + max([m['token'] for m in token_maps.values()]), \n",
     "                          num_tokens=config['model_params']['vocab_size'],\n",
     "                          hidden_size=config['model_params']['hidden_size'])\n",
     "    \n",