From adfff78d6965c686482318f7fde9424d290cc206 Mon Sep 17 00:00:00 2001 From: dsplog Date: Thu, 30 Nov 2023 08:30:33 +0530 Subject: [PATCH 01/36] using multilinguarl bert --- Configs/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Configs/config.yml b/Configs/config.yml index 75f60d1e..5c0f8f58 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -19,6 +19,8 @@ dataset_params: word_mask_prob: 0.15 # probability to mask the entire word phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes + + multilingual_tokenizer : "bert-base-multilingual-cased" model_params: vocab_size: 178 @@ -27,4 +29,4 @@ model_params: intermediate_size: 2048 max_position_embeddings: 512 num_hidden_layers: 12 - dropout: 0.1 \ No newline at end of file + dropout: 0.1 From 5ce542e1772c3ad8dcfdbbe870f9400c18962b11 Mon Sep 17 00:00:00 2001 From: dsplog Date: Thu, 30 Nov 2023 08:30:53 +0530 Subject: [PATCH 02/36] addign support for malayalam --- converters/Cardinal.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/converters/Cardinal.py b/converters/Cardinal.py index d1fc17c4..5d2e0e3e 100644 --- a/converters/Cardinal.py +++ b/converters/Cardinal.py @@ -4,6 +4,7 @@ import re from .Roman import Roman +from numbers_malayalam import expand_numbers @singleton class Cardinal: @@ -40,7 +41,7 @@ class Cardinal: - For example: "20" -> "two", in some situations. - These cases account to a total of 37 cases between the total 133744 CARDINAL tokens. """ - def __init__(self): + def __init__(self,language='en'): super().__init__() # Regex to remove non digits (spaces, commas etc.), but keep "-" self.filter_regex = re.compile("[^0-9\-]") @@ -114,6 +115,8 @@ def __init__(self): # Roman conversion self.roman = Roman() + self.language = language + def _give_chunk(self, num_str: str, size:int = 3) -> str: # While string not empty while num_str: @@ -187,7 +190,15 @@ def convert(self, token: str) -> str: text_list = chunk_text_list + text_list # 15 Join the list elements with spaces - token = " ".join(text_list) + if self.language == 'ml' : + try : + token = expand_numbers(int(token))[0] + except : + token = " ".join(text_list) + else : + token = " ".join(text_list) + + # 16 Apply pre and suffixes, if applicable if prefix: From 6da16b6514e8e4c6400a71bd7a71818a8ad5bc2c Mon Sep 17 00:00:00 2001 From: dsplog Date: Thu, 30 Nov 2023 08:31:27 +0530 Subject: [PATCH 03/36] with multilingual tokenizer, adding subword support, adding malayalam support --- phonemize.py | 60 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/phonemize.py b/phonemize.py index f6885572..9ec4baf0 100644 --- a/phonemize.py +++ b/phonemize.py @@ -1,5 +1,5 @@ import string -from text_normalize import normalize_text, remove_accents +from text_normalize import normalize_text, remove_accents, is_malayalam special_mappings = { "a": "ɐ", @@ -18,17 +18,39 @@ "doesn": "dˈʌzən", } -def phonemize(text, global_phonemizer, tokenizer): - text = normalize_text(remove_accents(text)) +def issubword(word) : + return word.startswith('##') + +def phonemize_word(word) : + # removing subword indicator ## from the string before phonemizing + if issubword(word) : + word = word[2:] + + phoneme = global_phonemizer.phonemize([word], strip=True)[0] + + if len(word) ==1 and is_malayalam(word) : + ''' + for single character unicode, epspeak ng is returning the language prefix + see issue : https://github.com/bootphon/phonemizer/issues/160 + removing the prefix "mæleɪˈɑːləm" + TODO : how to make it generic for any langauge + ''' + phoneme = phoneme[11:] + return phoneme + +def phonemize(text, global_phonemizer, tokenizer,language='en'): + text = normalize_text(remove_accents(text),language) words = tokenizer.tokenize(text) + ids = tokenizer.encode(text)[1:-1] - phonemes_bad = [global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words] + phonemes_bad = [ phonemize_word(word) if word not in string.punctuation else word for word in words] input_ids = [] phonemes = [] for i in range(len(words)): word = words[i] phoneme = phonemes_bad[i] + id = ids[i] for k, v in special_mappings.items(): if word == k: @@ -69,11 +91,33 @@ def phonemize(text, global_phonemizer, tokenizer): if "@" in word and len(word) > 1: # remove "@" if "@" in word and len(word) > 1: phonemes.append(word.replace('@', '')) - input_ids.append(tokenizer.encode(word.replace('@', ''))[0]) + input_ids.append(tokenizer.encode(word.replace('@', ''))[1]) continue - - input_ids.append(tokenizer.encode(word)[0]) + + input_ids.append(id) phonemes.append(phoneme) assert len(input_ids) == len(phonemes) - return {'input_ids' : input_ids, 'phonemes': phonemes} \ No newline at end of file + return {'input_ids' : input_ids, 'phonemes': phonemes} + +if __name__ == '__main__' : + from transformers import TransfoXLTokenizer + tname = "transfo-xl-wt103" + tokenizer = TransfoXLTokenizer.from_pretrained(tname) # you can use any other tokenizers if you want to + + from transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + + import phonemizer + global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True,language_switch='remove-flags') + + text = 'hello my dear did you get the wrong @number 12 12.5' + #text = 'ഇവരുമായി സഹകരിക്കില്ലെന്നാണ് സംഘടയുടെ തീരുമാനം.' + #text = 'നെഗറ്റീവ് എനർജി’ വിവാദം !: ശിശുസംരക്ഷണ ഓഫീസർക്ക് സസ്പെൻഷൻ!' + from datasets import load_dataset + dataset = load_dataset("wikipedia", language="ml", date="20231101",beam_runner='DirectRunner')['train'] + text = dataset[1]['text'] + text = 'hello from (1200 - 1230 - 1240)' + dd = phonemize(text, global_phonemizer, tokenizer, language="ml") + pass + From 9983db5ede1af94b4dd8d3d7fba8675786a00366 Mon Sep 17 00:00:00 2001 From: dsplog Date: Thu, 30 Nov 2023 08:31:53 +0530 Subject: [PATCH 04/36] adding support for range i.e (10-20) cases, malayalam --- text_normalize.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/text_normalize.py b/text_normalize.py index 5e6b15b8..83166b81 100644 --- a/text_normalize.py +++ b/text_normalize.py @@ -5,6 +5,7 @@ import unicodedata import os, sys +import re from converters.Plain import Plain from converters.Punct import Punct @@ -23,6 +24,8 @@ from converters.Telephone import Telephone from converters.Address import Address from converters.Roman import Roman +from converters.Range import Range + months = ['jan', 'feb', @@ -64,7 +67,8 @@ "FRACTION": Fraction(), "TELEPHONE": Telephone(), "ADDRESS": Address(), - "ROMAN": Roman() + "ROMAN": Roman(), + "RANGE": Range() } def split_given_size(a, size): @@ -83,9 +87,16 @@ def normalize_split(text): return normalized_text.replace(" ' s", "'s") +def is_malayalam(word) : + ''' + returns match if string starts with malayalam unicode block + https://en.wikipedia.org/wiki/Malayalam_(Unicode_block) + ''' + return re.match('([\u0d00-\u0d7f\u200d]+)',word) + def remove_accents(input_str): nfkd_form = unicodedata.normalize('NFKD', input_str) - return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) + return u"".join([c for c in nfkd_form if not unicodedata.combining(c) or is_malayalam(c)]) def has_numbers(inputString): return any(char.isdigit() for char in inputString) @@ -108,6 +119,9 @@ def is_fraction(inputString): def is_decimal(inputString): return "." in inputString +def is_range(inputString) : + return "-" in inputString + def is_url(inputString): return "//" in inputString or ".com" in inputString or ".html" in inputString @@ -136,6 +150,8 @@ def normalize_single(text, prev_text = "", next_text = ""): text = labels['DECIMAL'].convert(text) elif is_cardinal(text): text = labels['CARDINAL'].convert(text) + elif is_range(text): + text = labels['RANGE'].convert(text) else: text = labels['DATE'].convert(text) @@ -146,9 +162,13 @@ def normalize_single(text, prev_text = "", next_text = ""): return text.replace("$", "") -def normalize_text(text): +def normalize_text(text,language='en'): text = remove_accents(text).replace('–', ' to ').replace('-', ' - ').replace(":p", ": p").replace(":P", ": P").replace(":d", ": d").replace(":D", ": D") words = word_tokenize(text) + + for label in labels : + labels[label].language = language + df = pd.DataFrame(words, columns=['before']) df['after'] = df['before'] @@ -157,4 +177,4 @@ def normalize_text(text): df['after'] = df['previous'].apply(lambda m: normalize_single(m.split('|')[1], m.split('|')[0], m.split('|')[2])) - return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s") \ No newline at end of file + return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s") From 2633144e01327301fe0db3717b61352a1e2e0238 Mon Sep 17 00:00:00 2001 From: dsplog Date: Thu, 30 Nov 2023 08:32:17 +0530 Subject: [PATCH 05/36] adding support for range i.e (10-20) cases, malayalam --- converters/Range.py | 42 +++++++++++++++++++++++++++ numbers_malayalam.py | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 converters/Range.py create mode 100644 numbers_malayalam.py diff --git a/converters/Range.py b/converters/Range.py new file mode 100644 index 00000000..3111a830 --- /dev/null +++ b/converters/Range.py @@ -0,0 +1,42 @@ + +from singleton_decorator import singleton +import re +from .Cardinal import Cardinal + +@singleton +class Range: + """ + Steps: + - Check for - splitting numbers + + Note: + Punctuation always stays the same + """ + def __init__(self, language='en'): + super().__init__() + self.cardinal = Cardinal() + self.language = language + + def convert(self, token: str) -> str: + numbers = re.split('-', token) + if len(numbers) == 1 : + token = self.cardinal.convert(numbers[0]) + elif len(numbers) == 2 : + + if self.language == 'ml' : + token = self.cardinal.convert(numbers[0]) + token += ' മുതൽ ' + token += self.cardinal.convert(numbers[1]) + token += ' വരെ ' + else : + token = self.cardinal.convert(numbers[0]) + token += ' to ' + token += self.cardinal.convert(numbers[1]) + + else : + token = '' + for number in numbers : + token += self.cardinal.convert(number) + token += ' ' + + return token diff --git a/numbers_malayalam.py b/numbers_malayalam.py new file mode 100644 index 00000000..0af85cfc --- /dev/null +++ b/numbers_malayalam.py @@ -0,0 +1,68 @@ +from mlmorph import Generator +generator = Generator() + + +onesStr = [ + "പൂജ്യം", + "ഒന്ന്", + "രണ്ട്", + "മൂന്ന്", + "നാല്", + "അഞ്ച്", + "ആറ്", + "ഏഴ്", + "എട്ട്", + "ഒമ്പത്" +] + + +def clean(result) : + result = result.replace("", "") + result = result.replace("", "") + #result = result.replace("ഒന്ന്", "") # is it needed? + result = result.replace("ഒന്ന്", "") if result.startswith("ഒന്ന്") else result #to handle 11000 + return result + +def positionValues(value) : + result = "" + crores = int(value / 10000000) if (value >= 10000000) else 0 + lakhs = int((value % 10000000) / 100000) + thousands = int((value % 100000) / 1000) + hundreds = int((value % 1000) / 100) + tens = int((value % 100) / 10) + ones = int((value % 10) / 1) + result = ((positionValues(crores) + "") if (crores > 0) else "") + \ + ((positionValues(lakhs) + "") if (lakhs > 0) else "") + \ + ((positionValues(thousands) + "") if (thousands > 0) else "") + \ + ((positionValues(hundreds) + "") if (hundreds > 0) else "") + \ + ((positionValues(tens) + "") if (tens > 0) else "") + \ + ((onesStr[ones] + "") if (ones > 0) else "") + \ + ((onesStr[ones] + "") if (value == 0) else "") + return clean(result) + +def spellOut(value) : + return positionValues(value) + "" + +def expand_numbers(value,weight=False) : + numtext = spellOut(value) + out = generator.generate(numtext,weighted=weight) + return out + + + + + + + + + +if __name__ == '__main__' : + text = expand_numbers(110) + print(text) + pass + + #out = expand_numbers_ml(value) + + + + From be3473d48a6ba98524fd8e302229f0513f3f2af4 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 09:06:15 +0530 Subject: [PATCH 06/36] support for malayalam pre-processing --- preprocess.ipynb | 35 ++++++-- preprocess_converted.py | 180 ++++++++++++++++++++++++++++++++++++++++ text_normalize.py | 6 +- 3 files changed, 214 insertions(+), 7 deletions(-) create mode 100644 preprocess_converted.py diff --git a/preprocess.ipynb b/preprocess.ipynb index 1017addf..5004646b 100644 --- a/preprocess.ipynb +++ b/preprocess.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "c1d31f54", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "21eb8ed4", "metadata": {}, @@ -39,6 +41,14 @@ "from phonemize import phonemize" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b83a77e4", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -47,7 +57,11 @@ "outputs": [], "source": [ "import phonemizer\n", - "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)" + "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', \n", + " preserve_punctuation=True, \n", + " with_stress=True,\n", + " language_switch='remove-flags'\n", + " )" ] }, { @@ -57,11 +71,15 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import TransfoXLTokenizer\n", - "tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to" + "#from transformers import TransfoXLTokenizer\n", + "#tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to\n", + "\n", + "from transformers import BertTokenizer\n", + "tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['multilingual_tokenizer'])\n" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2eb25417", "metadata": {}, @@ -79,7 +97,8 @@ "outputs": [], "source": [ "from datasets import load_dataset\n", - "dataset = load_dataset(\"wikipedia\", \"20220301.en\")['train'] # you can use other version of this dataset" + "#dataset = load_dataset(\"wikipedia\", \"20220301.en\")['train'] # you can use other version of this dataset\n", + "dataset = load_dataset(\"wikipedia\", language=\"ml\", date=\"20231101\",beam_runner='DirectRunner')['train']" ] }, { @@ -89,7 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "root_directory = \"./wiki_phoneme\" # set up root directory for multiprocessor processing" + "root_directory = \"./wiki_ml_phoneme\" # set up root directory for multiprocessor processing" ] }, { @@ -127,6 +146,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c21f9dcf", "metadata": {}, @@ -143,13 +163,14 @@ }, "outputs": [], "source": [ - "max_workers = 32 # change this to the number of CPU cores your machine has \n", + "max_workers = 3 # change this to the number of CPU cores your machine has \n", "\n", "with ProcessPool(max_workers=max_workers) as pool:\n", " pool.map(process_shard, range(num_shards), timeout=60)" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b78caee6", "metadata": {}, @@ -202,6 +223,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cdf6f6f6", "metadata": {}, @@ -310,6 +332,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1c9e968e", "metadata": {}, diff --git a/preprocess_converted.py b/preprocess_converted.py new file mode 100644 index 00000000..87840431 --- /dev/null +++ b/preprocess_converted.py @@ -0,0 +1,180 @@ +# %% [markdown] +# # Notebook for preprocessing Wikipedia (English) dataset + +# %% [markdown] +# ### Initilizing phonemizer and tokenizer + +# %% +import yaml + +config_path = "Configs/config.yml" # you can change it to anything else +config = yaml.safe_load(open(config_path)) + +# %% +from phonemize import phonemize + +# %% +import phonemizer +global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', + preserve_punctuation=True, + with_stress=True, + language_switch='remove-flags' + ) + +#from transformers import TransfoXLTokenizer +#tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to + +from transformers import BertTokenizer +tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['multilingual_tokenizer']) +# %% [markdown] +# ### Process dataset + +# %% +from datasets import load_dataset +#dataset = load_dataset("wikipedia", "20220301.en")['train'] # you can use other version of this dataset +#dataset = load_dataset("wikipedia", "20220301.ml")['train'] # you can use other version of this dataset +#dataset = load_dataset("wikipedia", '20231120.ml', beam_runner='DirectRunner')['train'] +dataset = load_dataset("wikipedia", language="ml", date="20231101",beam_runner='DirectRunner')['train'] + + + + +# %% +root_directory = "./wiki_ml_phoneme" # set up root directory for multiprocessor processing + +# %% +import os +num_shards = 50000 + +def process_shard(i): + directory = root_directory + "/shard_" + str(i) + if os.path.exists(directory): + print("Shard %d already exists!" % i) + return + print('Processing shard %d ...' % i) + shard = dataset.shard(num_shards=num_shards, index=i) + processed_dataset = shard.map(lambda t: phonemize(t['text'], + global_phonemizer, + tokenizer, + language="ml" + ), + remove_columns=['text'], + ) + if not os.path.exists(directory): + os.makedirs(directory) + processed_dataset.save_to_disk(directory) + +# %% +from pebble import ProcessPool +from concurrent.futures import TimeoutError + +# %% [markdown] +# #### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed. +# + +# %% +max_workers = 3 # change this to the number of CPU cores your machine has + +with ProcessPool(max_workers=max_workers) as pool: + pool.map(process_shard, range(num_shards), timeout=300) + +# %% [markdown] +# ### Collect all shards to form the processed dataset + +# %% +from datasets import load_from_disk, concatenate_datasets + +output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))] +datasets = [] +for o in output: + directory = root_directory + "/" + o + try: + shard = load_from_disk(directory) + datasets.append(shard) + print("%s loaded" % o) + except: + continue + +# %% +dataset = concatenate_datasets(datasets) +dataset.save_to_disk(config['data_folder']) +print('Dataset saved to %s' % config['data_folder']) + +# %% +# check the dataset size +dataset + +# %% [markdown] +# ### Remove unneccessary tokens from the pre-trained tokenizer +# The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. + +# %% +from simple_loader import FilePathDataset, build_dataloader + +file_data = FilePathDataset(dataset) +loader = build_dataloader(file_data, num_workers=4, batch_size=128) + +# %% +special_token = config['dataset_params']['word_separator'] + +# %% +# get all unique tokens in the entire dataset + +from tqdm import tqdm + +unique_index = [special_token] +for _, batch in enumerate(tqdm(loader)): + unique_index.extend(batch) + unique_index = list(set(unique_index)) + +# %% +# get each token's lower case + +lower_tokens = [] +for t in tqdm(unique_index): + word = tokenizer.decode([t]) + if word.lower() != word: + t = tokenizer.encode([word.lower()])[0] + lower_tokens.append(t) + else: + lower_tokens.append(t) + +# %% +lower_tokens = (list(set(lower_tokens))) + +# %% +# redo the mapping for lower number of tokens + +token_maps = {} +for t in tqdm(unique_index): + word = tokenizer.decode([t]) + word = word.lower() + new_t = tokenizer.encode([word.lower()])[0] + token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)} + +# %% +import pickle +with open(config['dataset_params']['token_maps'], 'wb') as handle: + pickle.dump(token_maps, handle) +print('Token mapper saved to %s' % config['dataset_params']['token_maps']) + +# %% [markdown] +# ### Test the dataset with dataloader +# + +# %% +from dataloader import build_dataloader, FilePathDataset + +tmp = FilePathDataset(dataset, **config['dataset_params']) + +for k in range(len(tmp)) : + data = tmp[k] + +train_loader = build_dataloader(dataset, batch_size=32, num_workers=4, dataset_config=config['dataset_params']) + +# %% +_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader)) + +pass + + diff --git a/text_normalize.py b/text_normalize.py index 83166b81..84efcda2 100644 --- a/text_normalize.py +++ b/text_normalize.py @@ -92,7 +92,7 @@ def is_malayalam(word) : returns match if string starts with malayalam unicode block https://en.wikipedia.org/wiki/Malayalam_(Unicode_block) ''' - return re.match('([\u0d00-\u0d7f\u200d]+)',word) + return re.match('([\u0d00-\u0d7f]+)',word) def remove_accents(input_str): nfkd_form = unicodedata.normalize('NFKD', input_str) @@ -164,8 +164,12 @@ def normalize_single(text, prev_text = "", next_text = ""): def normalize_text(text,language='en'): text = remove_accents(text).replace('–', ' to ').replace('-', ' - ').replace(":p", ": p").replace(":P", ": P").replace(":d", ": d").replace(":D", ": D") + # removing zero-width-no-joiner which is seen in malayalam text + # https://en.wikipedia.org/wiki/Zero-width_non-joiner + text = text.replace('\u200c','') words = word_tokenize(text) + # hack - changing the language attribute for the conversion pipeline for label in labels : labels[label].language = language From 8fee41a78135567db1d9a6b87e313cb9bc73d54a Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:00:44 +0530 Subject: [PATCH 07/36] keeping default unchanged --- Configs/config.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Configs/config.yml b/Configs/config.yml index 5c0f8f58..472ffa34 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -20,7 +20,6 @@ dataset_params: phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes - multilingual_tokenizer : "bert-base-multilingual-cased" model_params: vocab_size: 178 @@ -28,5 +27,4 @@ model_params: num_attention_heads: 12 intermediate_size: 2048 max_position_embeddings: 512 - num_hidden_layers: 12 - dropout: 0.1 + num_hidden_layers: 12 \ No newline at end of file From 884a03129fd2a88c3deb83cf807a398c0859c054 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:01:32 +0530 Subject: [PATCH 08/36] keeping default unchanged --- Configs/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Configs/config.yml b/Configs/config.yml index 472ffa34..68d3c8df 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -27,4 +27,6 @@ model_params: num_attention_heads: 12 intermediate_size: 2048 max_position_embeddings: 512 - num_hidden_layers: 12 \ No newline at end of file + num_hidden_layers: 12 + +dropout: 0.1 \ No newline at end of file From 6ecf98b8cd3b2b24052cba6accc6194010352f0e Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:02:03 +0530 Subject: [PATCH 09/36] keeping default unchanged --- Configs/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Configs/config.yml b/Configs/config.yml index 68d3c8df..c31bc0e4 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -28,5 +28,4 @@ model_params: intermediate_size: 2048 max_position_embeddings: 512 num_hidden_layers: 12 - -dropout: 0.1 \ No newline at end of file + dropout: 0.1 \ No newline at end of file From cf62fcc533e6f11381630015e6a7c09dfcf29935 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:02:30 +0530 Subject: [PATCH 10/36] keeping default unchanged --- Configs/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/Configs/config.yml b/Configs/config.yml index c31bc0e4..c8cf91ac 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -20,7 +20,6 @@ dataset_params: phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes - model_params: vocab_size: 178 hidden_size: 768 From 235bf7961e5c675dfbc0d9061543cabecf5b065c Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:03:04 +0530 Subject: [PATCH 11/36] keeping default unchanged --- Configs/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/Configs/config.yml b/Configs/config.yml index c8cf91ac..f169511b 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -20,6 +20,7 @@ dataset_params: phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes + model_params: vocab_size: 178 hidden_size: 768 From f6f75c809a290e2d4f09348a45c8f9e2808001dd Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:03:30 +0530 Subject: [PATCH 12/36] keeping default unchanged --- Configs/config.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Configs/config.yml b/Configs/config.yml index f169511b..e912168f 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -19,8 +19,6 @@ dataset_params: word_mask_prob: 0.15 # probability to mask the entire word phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes - - model_params: vocab_size: 178 hidden_size: 768 From 5558f2518ac7f90e4214fe3350b1a2267ff66f51 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 15:03:52 +0530 Subject: [PATCH 13/36] keeping default unchanged --- Configs/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/Configs/config.yml b/Configs/config.yml index e912168f..c8cf91ac 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -19,6 +19,7 @@ dataset_params: word_mask_prob: 0.15 # probability to mask the entire word phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes + model_params: vocab_size: 178 hidden_size: 768 From 3a94f25c7f284b851e7c07db44ee71caf60bedcf Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 16:21:40 +0530 Subject: [PATCH 14/36] adding multilingual configurations --- Configs/config_multi_en.yml | 32 ++++++++++++++++++++++++++++++++ Configs/config_multi_ml.yml | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 Configs/config_multi_en.yml create mode 100644 Configs/config_multi_ml.yml diff --git a/Configs/config_multi_en.yml b/Configs/config_multi_en.yml new file mode 100644 index 00000000..275681ef --- /dev/null +++ b/Configs/config_multi_en.yml @@ -0,0 +1,32 @@ +log_dir: "Checkpoint" +mixed_precision: "fp16" +data_folder: "wikipedia_20220301.en.processed" +batch_size: 192 +save_interval: 5000 +log_interval: 10 +num_process: 1 # number of GPUs +num_steps: 1000000 + +dataset_params: + tokenizer: "bert-base-multilingual-cased" + token_separator: " " # token used for phoneme separator (space) + token_mask: "M" # token used for phoneme mask (M) + word_separator: 3039 # token used for word separator () + token_maps: "token_maps_multi_en.pkl" # token map path + language : "en" + mode : "multi" + + max_mel_length: 512 # max phoneme length + + word_mask_prob: 0.15 # probability to mask the entire word + phoneme_mask_prob: 0.1 # probability to mask each phoneme + replace_prob: 0.2 # probablity to replace phonemes + +model_params: + vocab_size: 178 + hidden_size: 768 + num_attention_heads: 12 + intermediate_size: 2048 + max_position_embeddings: 512 + num_hidden_layers: 12 + dropout: 0.1 diff --git a/Configs/config_multi_ml.yml b/Configs/config_multi_ml.yml new file mode 100644 index 00000000..e16c2085 --- /dev/null +++ b/Configs/config_multi_ml.yml @@ -0,0 +1,32 @@ +log_dir: "Checkpoint" +mixed_precision: "fp16" +data_folder: "wikipedia_20231101.ml" +batch_size: 192 +save_interval: 5000 +log_interval: 10 +num_process: 1 # number of GPUs +num_steps: 1000000 + +dataset_params: + tokenizer: "bert-base-multilingual-cased" + token_separator: " " # token used for phoneme separator (space) + token_mask: "M" # token used for phoneme mask (M) + word_separator: 3039 # token used for word separator () + token_maps: "token_maps_multi_ml.pkl" # token map path + language : "ml" + mode : "multi" + + max_mel_length: 512 # max phoneme length + + word_mask_prob: 0.15 # probability to mask the entire word + phoneme_mask_prob: 0.1 # probability to mask each phoneme + replace_prob: 0.2 # probablity to replace phonemes + +model_params: + vocab_size: 178 + hidden_size: 768 + num_attention_heads: 12 + intermediate_size: 2048 + max_position_embeddings: 512 + num_hidden_layers: 12 + dropout: 0.1 From b7145b1b2e2e640c7acdb66c1d16ac64b4cafa45 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 16:24:04 +0530 Subject: [PATCH 15/36] correcting readme --- README.md | 2 +- preprocess_converted.py => preprocess.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename preprocess_converted.py => preprocess.py (100%) diff --git a/README.md b/README.md index 11ac49a4..3e3705f2 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install pandas singleton-decorator datasets "transformers<4.33.3" accelerate ``` ## Preprocessing -Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. +Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) or [preprocess.py](https://github.com/yl4579/PL-BERT/blob/main/preprocess.py) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. ## Trianing Please run each cell in the notebook [train.ipynb](https://github.com/yl4579/PL-BERT/blob/main/train.ipynb). You will need to change the line diff --git a/preprocess_converted.py b/preprocess.py similarity index 100% rename from preprocess_converted.py rename to preprocess.py From 1d79ef2fdebfe287da599cb97a51326b8b5ec77b Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 16:24:38 +0530 Subject: [PATCH 16/36] baselines --- preprocess.ipynb | 17 ++++++++++++++--- preprocess.py | 27 +++++++++++++++++---------- text_normalize.py | 6 ++++++ 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/preprocess.ipynb b/preprocess.ipynb index 5004646b..3fc3f0a0 100644 --- a/preprocess.ipynb +++ b/preprocess.ipynb @@ -28,7 +28,13 @@ "import yaml\n", "\n", "config_path = \"Configs/config.yml\" # you can change it to anything else\n", - "config = yaml.safe_load(open(config_path))" + "# config_path = \"Configs/config_multi_en.yml\" # multilingual english\n", + "# config_path = \"Configs/config_multi_ml.yml\" # multilingual malayalam\n", + "config = yaml.safe_load(open(config_path))\n", + "\n", + "\n", + "language = config['dataset_params'].get('language','en')\n", + "mode = config['dataset_params'].get('mode','mono-en')" ] }, { @@ -75,7 +81,7 @@ "#tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to\n", "\n", "from transformers import BertTokenizer\n", - "tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['multilingual_tokenizer'])\n" + "tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['tokenizer'])\n" ] }, { @@ -128,7 +134,12 @@ " return\n", " print('Processing shard %d ...' % i)\n", " shard = dataset.shard(num_shards=num_shards, index=i)\n", - " processed_dataset = shard.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])\n", + " processed_dataset = shard.map(lambda t: phonemize(t['text'], \n", + " global_phonemizer, \n", + " tokenizer,\n", + " language=\"ml\"\n", + " ), \n", + " remove_columns=['text'])\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)\n", " processed_dataset.save_to_disk(directory)" diff --git a/preprocess.py b/preprocess.py index 87840431..290dc090 100644 --- a/preprocess.py +++ b/preprocess.py @@ -10,6 +10,9 @@ config_path = "Configs/config.yml" # you can change it to anything else config = yaml.safe_load(open(config_path)) +language = config['dataset_params'].get('language','en') +mode = config['dataset_params'].get('mode','mono-en') + # %% from phonemize import phonemize @@ -21,26 +24,30 @@ language_switch='remove-flags' ) -#from transformers import TransfoXLTokenizer -#tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to -from transformers import BertTokenizer -tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['multilingual_tokenizer']) +if mode == 'mono-en' : + from transformers import TransfoXLTokenizer + tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to +elif 'multi' in mode : + from transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['tokenizer']) + # %% [markdown] # ### Process dataset # %% from datasets import load_dataset -#dataset = load_dataset("wikipedia", "20220301.en")['train'] # you can use other version of this dataset -#dataset = load_dataset("wikipedia", "20220301.ml")['train'] # you can use other version of this dataset -#dataset = load_dataset("wikipedia", '20231120.ml', beam_runner='DirectRunner')['train'] -dataset = load_dataset("wikipedia", language="ml", date="20231101",beam_runner='DirectRunner')['train'] +src, suffix = config['data_folder'].split('_') +if 'en' in suffix : + dataset = load_dataset(src, "20220301.en")['train'] # you can use other version of this dataset +elif 'ml' in suffix : + dataset = load_dataset(src, language="ml", date="20231101",beam_runner='DirectRunner')['train'] # %% -root_directory = "./wiki_ml_phoneme" # set up root directory for multiprocessor processing +root_directory = f"./wiki_{language}_phoneme" # set up root directory for multiprocessor processing # %% import os @@ -56,7 +63,7 @@ def process_shard(i): processed_dataset = shard.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer, - language="ml" + language=language ), remove_columns=['text'], ) diff --git a/text_normalize.py b/text_normalize.py index 84efcda2..a0223acf 100644 --- a/text_normalize.py +++ b/text_normalize.py @@ -182,3 +182,9 @@ def normalize_text(text,language='en'): df['after'] = df['previous'].apply(lambda m: normalize_single(m.split('|')[1], m.split('|')[0], m.split('|')[2])) return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s") + +if __name__ == '__main__' : + text = 'hello (1200 - 1230)' + out = normalize_text(text,language='ml') + print(out) + From 5bc400955ff86ea45f4e02aca1552e12380506e6 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 16:25:37 +0530 Subject: [PATCH 17/36] baselines --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e3705f2..c23e6033 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install pandas singleton-decorator datasets "transformers<4.33.3" accelerate ``` ## Preprocessing -Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) or [preprocess.py](https://github.com/yl4579/PL-BERT/blob/main/preprocess.py) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. +Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) or [preprocess.py](https://github.com/dsplog/PL-BERT/blob/main/preprocess.py) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. ## Trianing Please run each cell in the notebook [train.ipynb](https://github.com/yl4579/PL-BERT/blob/main/train.ipynb). You will need to change the line From 13b5f03990351d82bb35a6b7c18fafe22546a808 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 2 Dec 2023 16:43:28 +0530 Subject: [PATCH 18/36] adding missed global_phonemizer --- phonemize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phonemize.py b/phonemize.py index 9ec4baf0..a4b9be78 100644 --- a/phonemize.py +++ b/phonemize.py @@ -21,7 +21,7 @@ def issubword(word) : return word.startswith('##') -def phonemize_word(word) : +def phonemize_word(global_phonemizer, word) : # removing subword indicator ## from the string before phonemizing if issubword(word) : word = word[2:] @@ -43,7 +43,7 @@ def phonemize(text, global_phonemizer, tokenizer,language='en'): words = tokenizer.tokenize(text) ids = tokenizer.encode(text)[1:-1] - phonemes_bad = [ phonemize_word(word) if word not in string.punctuation else word for word in words] + phonemes_bad = [ phonemize_word(global_phonemizer, word) if word not in string.punctuation else word for word in words] input_ids = [] phonemes = [] From e0a29606a7b8e8af27529c93bda586894c58d67a Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 3 Dec 2023 09:44:05 +0530 Subject: [PATCH 19/36] adding support for decimal numbers in malayalam --- converters/Decimal.py | 16 ++++++++++++---- converters/Digit.py | 26 +++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/converters/Decimal.py b/converters/Decimal.py index abbb72a0..0bb91630 100644 --- a/converters/Decimal.py +++ b/converters/Decimal.py @@ -24,7 +24,7 @@ class Decimal: Edge cases: 3.66E-49 -> three point six six times ten to the minus fourty nine """ - def __init__(self): + def __init__(self,language='en'): super().__init__() # Regex to detect input of the sort "x.y" or ".y" self.decimal_regex = re.compile(r"(-?\d*)\.(\d+)(.*)") @@ -60,7 +60,8 @@ def __init__(self): self.suffix_regex = re.compile(f" *({'|'.join(self.suffixes)})") # Regular expression for xEy self.e_suffix_regex = re.compile(r" *E(-?\d+)") - + self.language = language + def convert(self, token: str) -> str: # 1 Filter out commas @@ -103,9 +104,16 @@ def convert(self, token: str) -> str: # 6, 7 Only if the decimal is 0, and there is a number in front of the dot, and there is no suffix # then we use "zero" instead of "o". if len(decimal) > 0: - result_list.append("point") + if self.language == 'en' : + result_list.append("point") + elif self.language == 'ml' : + result_list.append("ദശാംശം") if decimal == "0" and len(number) > 0 and len(suffix) == 0: - result_list.append("zero") + if self.language == 'en' : + result_list.append("zero") + elif self.language == 'ml' : + result_list.append("പൂജ്യം") + else: # 8 Otherwise use Digit conversion result_list.append(self.digit.convert(decimal)) diff --git a/converters/Digit.py b/converters/Digit.py index accb4d4b..d81c7d00 100644 --- a/converters/Digit.py +++ b/converters/Digit.py @@ -16,7 +16,7 @@ class Digit: 007 -> double o 7 while 003 -> o o 3 """ - def __init__(self): + def __init__(self,language='en'): super().__init__() # Regex used to filter out non digits self.filter_regex = re.compile("[^0-9]") @@ -33,13 +33,33 @@ def __init__(self): "8": "eight", "9": "nine" } + # Translation dict to convert digits to text + self.trans_dict_malayalam = { + "0": "പൂജ്യം", + "1": "ഒന്ന്", + "2": "രണ്ട്", + "3": "മൂന്ന്", + "4": "നാല്", + "5": "അഞ്ച്", + "6": "ആറ്", + "7": "ഏഴ്", + "8": "എട്ട്", + "9": "ഒമ്പത്" + } + def convert(self, token: str) -> str: # 1 Filter out anything that isn't a digit token = self.filter_regex.sub("", token) # 2 Check for special case if token == "007": - return "double o seven" + if self.language == 'en' : + return "double o seven" + elif self.language == 'ml' : + return "ഡബിൾ ഓ സെവൻ" # 3 & 4 Convert each digit to text and space out the text - token = " ".join([self.trans_dict[c] for c in token]) + if self.language == 'en' : + token = " ".join([self.trans_dict[c] for c in token]) + elif self.language == 'ml' : + token = " ".join([self.trans_dict_malayalam[c] for c in token]) return token From afa71f690d54afdd33f5c922f39f0975faccd4fb Mon Sep 17 00:00:00 2001 From: dsplog Date: Fri, 8 Dec 2023 08:09:17 +0530 Subject: [PATCH 20/36] adding support for malayalam --- converters/Date.py | 7 ++++--- converters/Ordinal.py | 3 ++- text_normalize.py | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/converters/Date.py b/converters/Date.py index 678ee3a4..902788e9 100644 --- a/converters/Date.py +++ b/converters/Date.py @@ -28,7 +28,7 @@ class Date: Note: This converters essentially uses regular expressions only. The regular expressions could be used to classify the data as well. """ - def __init__(self): + def __init__(self,language='en'): super().__init__() # Regex to remove dots self.filter_regex = re.compile(r"[,']") @@ -115,6 +115,7 @@ def __init__(self): # Cardinal and Ordinal conversion self.cardinal = Cardinal() self.ordinal = Ordinal() + self.language = language def convert(self, token: str) -> str: @@ -153,9 +154,9 @@ def construct_output(): # If we want the "the D of M Y" format if dmy: if day: - result_list.append("the") + if self.language =='en' : result_list.append("the") result_list.append(day) - result_list.append("of") + if self.language =='en' : result_list.append("of") result_list.append(month) else: # Otherwise use "M D Y" format diff --git a/converters/Ordinal.py b/converters/Ordinal.py index f1a77496..065ae484 100644 --- a/converters/Ordinal.py +++ b/converters/Ordinal.py @@ -127,7 +127,8 @@ def convert(self, token: str) -> str: # Convert the number to cardinal style, and convert the last word to # the ordinal style using self.trans_denominator. number_text_list = self.cardinal.convert(token).split(" ") - number_text_list[-1] = self.trans_denominator[number_text_list[-1]] + if self.language == 'en' : + number_text_list[-1] = self.trans_denominator[number_text_list[-1]] result = " ".join(number_text_list) # 6 Apply pre- and suffixes, if applicable diff --git a/text_normalize.py b/text_normalize.py index a0223acf..7b4e6e7d 100644 --- a/text_normalize.py +++ b/text_normalize.py @@ -133,10 +133,10 @@ def normalize_single(text, prev_text = "", next_text = ""): text = labels['ELECTRONIC'].convert(text).upper() elif has_numbers(text): if has_month(prev_text): - prev_text = prev_text.lower() + prev_text = labels['DATE'].get_month(prev_text.lower()) text = labels['DATE'].convert(prev_text + " " + text).replace(prev_text, "").strip() elif has_month(next_text): - next_text = next_text.lower() + next_text = labels['DATE'].get_month(next_text.lower()) text = labels['DATE'].convert(text + " " + next_text).replace(next_text, "").strip() elif is_oridinal(text): text = labels['ORDINAL'].convert(text) @@ -184,7 +184,7 @@ def normalize_text(text,language='en'): return TreebankWordDetokenizer().detokenize(df['after'].tolist()).replace("’ s", "'s").replace(" 's", "'s") if __name__ == '__main__' : - text = 'hello (1200 - 1230)' + text = 'hello (23 Jan 2020, 12:10 AM)' out = normalize_text(text,language='ml') print(out) From 3556dd54521c0766a5b944d6160600a9d1260770 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:07:56 +0530 Subject: [PATCH 21/36] fix for range and date --- numbers_malayalam.py | 68 -------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 numbers_malayalam.py diff --git a/numbers_malayalam.py b/numbers_malayalam.py deleted file mode 100644 index 0af85cfc..00000000 --- a/numbers_malayalam.py +++ /dev/null @@ -1,68 +0,0 @@ -from mlmorph import Generator -generator = Generator() - - -onesStr = [ - "പൂജ്യം", - "ഒന്ന്", - "രണ്ട്", - "മൂന്ന്", - "നാല്", - "അഞ്ച്", - "ആറ്", - "ഏഴ്", - "എട്ട്", - "ഒമ്പത്" -] - - -def clean(result) : - result = result.replace("", "") - result = result.replace("", "") - #result = result.replace("ഒന്ന്", "") # is it needed? - result = result.replace("ഒന്ന്", "") if result.startswith("ഒന്ന്") else result #to handle 11000 - return result - -def positionValues(value) : - result = "" - crores = int(value / 10000000) if (value >= 10000000) else 0 - lakhs = int((value % 10000000) / 100000) - thousands = int((value % 100000) / 1000) - hundreds = int((value % 1000) / 100) - tens = int((value % 100) / 10) - ones = int((value % 10) / 1) - result = ((positionValues(crores) + "") if (crores > 0) else "") + \ - ((positionValues(lakhs) + "") if (lakhs > 0) else "") + \ - ((positionValues(thousands) + "") if (thousands > 0) else "") + \ - ((positionValues(hundreds) + "") if (hundreds > 0) else "") + \ - ((positionValues(tens) + "") if (tens > 0) else "") + \ - ((onesStr[ones] + "") if (ones > 0) else "") + \ - ((onesStr[ones] + "") if (value == 0) else "") - return clean(result) - -def spellOut(value) : - return positionValues(value) + "" - -def expand_numbers(value,weight=False) : - numtext = spellOut(value) - out = generator.generate(numtext,weighted=weight) - return out - - - - - - - - - -if __name__ == '__main__' : - text = expand_numbers(110) - print(text) - pass - - #out = expand_numbers_ml(value) - - - - From e25716995f345f2d65b46612252a460e1539e3b4 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:14:19 +0530 Subject: [PATCH 22/36] removing multilingaul things --- Configs/config_multi_en.yml | 32 -------------------------------- Configs/config_multi_ml.yml | 32 -------------------------------- 2 files changed, 64 deletions(-) delete mode 100644 Configs/config_multi_en.yml delete mode 100644 Configs/config_multi_ml.yml diff --git a/Configs/config_multi_en.yml b/Configs/config_multi_en.yml deleted file mode 100644 index 275681ef..00000000 --- a/Configs/config_multi_en.yml +++ /dev/null @@ -1,32 +0,0 @@ -log_dir: "Checkpoint" -mixed_precision: "fp16" -data_folder: "wikipedia_20220301.en.processed" -batch_size: 192 -save_interval: 5000 -log_interval: 10 -num_process: 1 # number of GPUs -num_steps: 1000000 - -dataset_params: - tokenizer: "bert-base-multilingual-cased" - token_separator: " " # token used for phoneme separator (space) - token_mask: "M" # token used for phoneme mask (M) - word_separator: 3039 # token used for word separator () - token_maps: "token_maps_multi_en.pkl" # token map path - language : "en" - mode : "multi" - - max_mel_length: 512 # max phoneme length - - word_mask_prob: 0.15 # probability to mask the entire word - phoneme_mask_prob: 0.1 # probability to mask each phoneme - replace_prob: 0.2 # probablity to replace phonemes - -model_params: - vocab_size: 178 - hidden_size: 768 - num_attention_heads: 12 - intermediate_size: 2048 - max_position_embeddings: 512 - num_hidden_layers: 12 - dropout: 0.1 diff --git a/Configs/config_multi_ml.yml b/Configs/config_multi_ml.yml deleted file mode 100644 index e16c2085..00000000 --- a/Configs/config_multi_ml.yml +++ /dev/null @@ -1,32 +0,0 @@ -log_dir: "Checkpoint" -mixed_precision: "fp16" -data_folder: "wikipedia_20231101.ml" -batch_size: 192 -save_interval: 5000 -log_interval: 10 -num_process: 1 # number of GPUs -num_steps: 1000000 - -dataset_params: - tokenizer: "bert-base-multilingual-cased" - token_separator: " " # token used for phoneme separator (space) - token_mask: "M" # token used for phoneme mask (M) - word_separator: 3039 # token used for word separator () - token_maps: "token_maps_multi_ml.pkl" # token map path - language : "ml" - mode : "multi" - - max_mel_length: 512 # max phoneme length - - word_mask_prob: 0.15 # probability to mask the entire word - phoneme_mask_prob: 0.1 # probability to mask each phoneme - replace_prob: 0.2 # probablity to replace phonemes - -model_params: - vocab_size: 178 - hidden_size: 768 - num_attention_heads: 12 - intermediate_size: 2048 - max_position_embeddings: 512 - num_hidden_layers: 12 - dropout: 0.1 From 0002a503f7ac8877718d8faf9368c5b96f57113a Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:17:07 +0530 Subject: [PATCH 23/36] fix --- phonemize.py | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/phonemize.py b/phonemize.py index a4b9be78..f29c9258 100644 --- a/phonemize.py +++ b/phonemize.py @@ -1,5 +1,5 @@ import string -from text_normalize import normalize_text, remove_accents, is_malayalam +from text_normalize import normalize_text, remove_accents special_mappings = { "a": "ɐ", @@ -18,39 +18,18 @@ "doesn": "dˈʌzən", } -def issubword(word) : - return word.startswith('##') -def phonemize_word(global_phonemizer, word) : - # removing subword indicator ## from the string before phonemizing - if issubword(word) : - word = word[2:] - - phoneme = global_phonemizer.phonemize([word], strip=True)[0] - - if len(word) ==1 and is_malayalam(word) : - ''' - for single character unicode, epspeak ng is returning the language prefix - see issue : https://github.com/bootphon/phonemizer/issues/160 - removing the prefix "mæleɪˈɑːləm" - TODO : how to make it generic for any langauge - ''' - phoneme = phoneme[11:] - return phoneme - -def phonemize(text, global_phonemizer, tokenizer,language='en'): - text = normalize_text(remove_accents(text),language) +def phonemize(text, global_phonemizer, tokenizer): + text = normalize_text(remove_accents(text)) words = tokenizer.tokenize(text) - ids = tokenizer.encode(text)[1:-1] - phonemes_bad = [ phonemize_word(global_phonemizer, word) if word not in string.punctuation else word for word in words] + phonemes_bad = [ global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words] input_ids = [] phonemes = [] for i in range(len(words)): word = words[i] phoneme = phonemes_bad[i] - id = ids[i] for k, v in special_mappings.items(): if word == k: @@ -91,7 +70,7 @@ def phonemize(text, global_phonemizer, tokenizer,language='en'): if "@" in word and len(word) > 1: # remove "@" if "@" in word and len(word) > 1: phonemes.append(word.replace('@', '')) - input_ids.append(tokenizer.encode(word.replace('@', ''))[1]) + input_ids.append(tokenizer.encode(word.replace('@', ''))[0]) continue input_ids.append(id) @@ -113,11 +92,12 @@ def phonemize(text, global_phonemizer, tokenizer,language='en'): text = 'hello my dear did you get the wrong @number 12 12.5' #text = 'ഇവരുമായി സഹകരിക്കില്ലെന്നാണ് സംഘടയുടെ തീരുമാനം.' - #text = 'നെഗറ്റീവ് എനർജി’ വിവാദം !: ശിശുസംരക്ഷണ ഓഫീസർക്ക് സസ്പെൻഷൻ!' + text = 'നെഗറ്റീവ് എനർജി’ വിവാദം !' from datasets import load_dataset dataset = load_dataset("wikipedia", language="ml", date="20231101",beam_runner='DirectRunner')['train'] text = dataset[1]['text'] text = 'hello from (1200 - 1230 - 1240)' + text = 'ദേശീയോദ്യാനങ്ങൾ സംരക്ഷിതപ്രദേശങ്ങളാണ്.' dd = phonemize(text, global_phonemizer, tokenizer, language="ml") pass From d3d6ccf207578de8f4aa5ae071ba78289535e832 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:17:49 +0530 Subject: [PATCH 24/36] fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c23e6033..11ac49a4 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install pandas singleton-decorator datasets "transformers<4.33.3" accelerate ``` ## Preprocessing -Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) or [preprocess.py](https://github.com/dsplog/PL-BERT/blob/main/preprocess.py) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. +Please refer to the notebook [preprocess.ipynb](https://github.com/yl4579/PL-BERT/blob/main/preprocess.ipynb) for more details. The preprocessing is for English Wikipedia dataset only. I will make a new branch for Japanese if I have extra time to demostrate training on other languages. You may also refer to [#6](https://github.com/yl4579/PL-BERT/issues/6#issuecomment-1797869275) for preprocessing in other languages like Japanese. ## Trianing Please run each cell in the notebook [train.ipynb](https://github.com/yl4579/PL-BERT/blob/main/train.ipynb). You will need to change the line From 755896365fac68c5228bb5fe239b09ac88fbba9b Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:19:15 +0530 Subject: [PATCH 25/36] fix --- phonemize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/phonemize.py b/phonemize.py index f29c9258..2107641a 100644 --- a/phonemize.py +++ b/phonemize.py @@ -1,5 +1,5 @@ import string -from text_normalize import normalize_text, remove_accents +from text_normalize import normalize_text, remove_accents special_mappings = { "a": "ɐ", @@ -23,7 +23,7 @@ def phonemize(text, global_phonemizer, tokenizer): text = normalize_text(remove_accents(text)) words = tokenizer.tokenize(text) - phonemes_bad = [ global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words] + phonemes_bad = [global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words] input_ids = [] phonemes = [] @@ -73,7 +73,7 @@ def phonemize(text, global_phonemizer, tokenizer): input_ids.append(tokenizer.encode(word.replace('@', ''))[0]) continue - input_ids.append(id) + input_ids.append(tokenizer.encode(word)[0]) phonemes.append(phoneme) assert len(input_ids) == len(phonemes) From 139bbbd6f0e9507bfe1468120c6cbbe0f1be463f Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:20:16 +0530 Subject: [PATCH 26/36] fix --- phonemize.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/phonemize.py b/phonemize.py index 2107641a..79bc1d9a 100644 --- a/phonemize.py +++ b/phonemize.py @@ -84,20 +84,12 @@ def phonemize(text, global_phonemizer, tokenizer): tname = "transfo-xl-wt103" tokenizer = TransfoXLTokenizer.from_pretrained(tname) # you can use any other tokenizers if you want to - from transformers import BertTokenizer - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - import phonemizer - global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True,language_switch='remove-flags') + global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True) text = 'hello my dear did you get the wrong @number 12 12.5' - #text = 'ഇവരുമായി സഹകരിക്കില്ലെന്നാണ് സംഘടയുടെ തീരുമാനം.' - text = 'നെഗറ്റീവ് എനർജി’ വിവാദം !' - from datasets import load_dataset - dataset = load_dataset("wikipedia", language="ml", date="20231101",beam_runner='DirectRunner')['train'] - text = dataset[1]['text'] text = 'hello from (1200 - 1230 - 1240)' text = 'ദേശീയോദ്യാനങ്ങൾ സംരക്ഷിതപ്രദേശങ്ങളാണ്.' - dd = phonemize(text, global_phonemizer, tokenizer, language="ml") + dd = phonemize(text, global_phonemizer, tokenizer) pass From 4cb495f76d1ccf81a82dbe2fafaa33a3a275fa89 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sat, 9 Dec 2023 10:21:42 +0530 Subject: [PATCH 27/36] fix --- converters/Cardinal.py | 12 ++---------- converters/Date.py | 7 +++---- converters/Decimal.py | 14 +++----------- converters/Digit.py | 26 +++----------------------- converters/Ordinal.py | 3 +-- 5 files changed, 12 insertions(+), 50 deletions(-) diff --git a/converters/Cardinal.py b/converters/Cardinal.py index 5d2e0e3e..a73064d9 100644 --- a/converters/Cardinal.py +++ b/converters/Cardinal.py @@ -4,7 +4,6 @@ import re from .Roman import Roman -from numbers_malayalam import expand_numbers @singleton class Cardinal: @@ -41,7 +40,7 @@ class Cardinal: - For example: "20" -> "two", in some situations. - These cases account to a total of 37 cases between the total 133744 CARDINAL tokens. """ - def __init__(self,language='en'): + def __init__(self): super().__init__() # Regex to remove non digits (spaces, commas etc.), but keep "-" self.filter_regex = re.compile("[^0-9\-]") @@ -115,7 +114,6 @@ def __init__(self,language='en'): # Roman conversion self.roman = Roman() - self.language = language def _give_chunk(self, num_str: str, size:int = 3) -> str: # While string not empty @@ -190,13 +188,7 @@ def convert(self, token: str) -> str: text_list = chunk_text_list + text_list # 15 Join the list elements with spaces - if self.language == 'ml' : - try : - token = expand_numbers(int(token))[0] - except : - token = " ".join(text_list) - else : - token = " ".join(text_list) + token = " ".join(text_list) diff --git a/converters/Date.py b/converters/Date.py index 902788e9..678ee3a4 100644 --- a/converters/Date.py +++ b/converters/Date.py @@ -28,7 +28,7 @@ class Date: Note: This converters essentially uses regular expressions only. The regular expressions could be used to classify the data as well. """ - def __init__(self,language='en'): + def __init__(self): super().__init__() # Regex to remove dots self.filter_regex = re.compile(r"[,']") @@ -115,7 +115,6 @@ def __init__(self,language='en'): # Cardinal and Ordinal conversion self.cardinal = Cardinal() self.ordinal = Ordinal() - self.language = language def convert(self, token: str) -> str: @@ -154,9 +153,9 @@ def construct_output(): # If we want the "the D of M Y" format if dmy: if day: - if self.language =='en' : result_list.append("the") + result_list.append("the") result_list.append(day) - if self.language =='en' : result_list.append("of") + result_list.append("of") result_list.append(month) else: # Otherwise use "M D Y" format diff --git a/converters/Decimal.py b/converters/Decimal.py index 0bb91630..da955f3e 100644 --- a/converters/Decimal.py +++ b/converters/Decimal.py @@ -24,7 +24,7 @@ class Decimal: Edge cases: 3.66E-49 -> three point six six times ten to the minus fourty nine """ - def __init__(self,language='en'): + def __init__(self): super().__init__() # Regex to detect input of the sort "x.y" or ".y" self.decimal_regex = re.compile(r"(-?\d*)\.(\d+)(.*)") @@ -60,7 +60,6 @@ def __init__(self,language='en'): self.suffix_regex = re.compile(f" *({'|'.join(self.suffixes)})") # Regular expression for xEy self.e_suffix_regex = re.compile(r" *E(-?\d+)") - self.language = language def convert(self, token: str) -> str: @@ -104,16 +103,9 @@ def convert(self, token: str) -> str: # 6, 7 Only if the decimal is 0, and there is a number in front of the dot, and there is no suffix # then we use "zero" instead of "o". if len(decimal) > 0: - if self.language == 'en' : - result_list.append("point") - elif self.language == 'ml' : - result_list.append("ദശാംശം") + result_list.append("point") if decimal == "0" and len(number) > 0 and len(suffix) == 0: - if self.language == 'en' : - result_list.append("zero") - elif self.language == 'ml' : - result_list.append("പൂജ്യം") - + result_list.append("zero") else: # 8 Otherwise use Digit conversion result_list.append(self.digit.convert(decimal)) diff --git a/converters/Digit.py b/converters/Digit.py index d81c7d00..accb4d4b 100644 --- a/converters/Digit.py +++ b/converters/Digit.py @@ -16,7 +16,7 @@ class Digit: 007 -> double o 7 while 003 -> o o 3 """ - def __init__(self,language='en'): + def __init__(self): super().__init__() # Regex used to filter out non digits self.filter_regex = re.compile("[^0-9]") @@ -33,33 +33,13 @@ def __init__(self,language='en'): "8": "eight", "9": "nine" } - # Translation dict to convert digits to text - self.trans_dict_malayalam = { - "0": "പൂജ്യം", - "1": "ഒന്ന്", - "2": "രണ്ട്", - "3": "മൂന്ന്", - "4": "നാല്", - "5": "അഞ്ച്", - "6": "ആറ്", - "7": "ഏഴ്", - "8": "എട്ട്", - "9": "ഒമ്പത്" - } - def convert(self, token: str) -> str: # 1 Filter out anything that isn't a digit token = self.filter_regex.sub("", token) # 2 Check for special case if token == "007": - if self.language == 'en' : - return "double o seven" - elif self.language == 'ml' : - return "ഡബിൾ ഓ സെവൻ" + return "double o seven" # 3 & 4 Convert each digit to text and space out the text - if self.language == 'en' : - token = " ".join([self.trans_dict[c] for c in token]) - elif self.language == 'ml' : - token = " ".join([self.trans_dict_malayalam[c] for c in token]) + token = " ".join([self.trans_dict[c] for c in token]) return token diff --git a/converters/Ordinal.py b/converters/Ordinal.py index 065ae484..f1a77496 100644 --- a/converters/Ordinal.py +++ b/converters/Ordinal.py @@ -127,8 +127,7 @@ def convert(self, token: str) -> str: # Convert the number to cardinal style, and convert the last word to # the ordinal style using self.trans_denominator. number_text_list = self.cardinal.convert(token).split(" ") - if self.language == 'en' : - number_text_list[-1] = self.trans_denominator[number_text_list[-1]] + number_text_list[-1] = self.trans_denominator[number_text_list[-1]] result = " ".join(number_text_list) # 6 Apply pre- and suffixes, if applicable From 70f3dc46fa03cf7e8fec79336225f05733caa125 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:13:29 +0530 Subject: [PATCH 28/36] fix --- Configs/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Configs/config.yml b/Configs/config.yml index c8cf91ac..12a85cb3 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -19,7 +19,7 @@ dataset_params: word_mask_prob: 0.15 # probability to mask the entire word phoneme_mask_prob: 0.1 # probability to mask each phoneme replace_prob: 0.2 # probablity to replace phonemes - + model_params: vocab_size: 178 hidden_size: 768 @@ -27,4 +27,4 @@ model_params: intermediate_size: 2048 max_position_embeddings: 512 num_hidden_layers: 12 - dropout: 0.1 \ No newline at end of file + dropout: 0.1 From e74e681f05e02930e52d0d1047ca2e86175a900a Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:15:11 +0530 Subject: [PATCH 29/36] fix --- Configs/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Configs/config.yml b/Configs/config.yml index 12a85cb3..75f60d1e 100644 --- a/Configs/config.yml +++ b/Configs/config.yml @@ -27,4 +27,4 @@ model_params: intermediate_size: 2048 max_position_embeddings: 512 num_hidden_layers: 12 - dropout: 0.1 + dropout: 0.1 \ No newline at end of file From b95ed66c91875add2f3b1b22bbbe4b42c1de0119 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:16:11 +0530 Subject: [PATCH 30/36] fix --- converters/Cardinal.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/converters/Cardinal.py b/converters/Cardinal.py index a73064d9..d1fc17c4 100644 --- a/converters/Cardinal.py +++ b/converters/Cardinal.py @@ -114,7 +114,6 @@ def __init__(self): # Roman conversion self.roman = Roman() - def _give_chunk(self, num_str: str, size:int = 3) -> str: # While string not empty while num_str: @@ -190,8 +189,6 @@ def convert(self, token: str) -> str: # 15 Join the list elements with spaces token = " ".join(text_list) - - # 16 Apply pre and suffixes, if applicable if prefix: token = f"{prefix} {token}" From fa93ea437e2f4de31efbcfa092a9fc11f019feea Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:16:43 +0530 Subject: [PATCH 31/36] fix --- converters/Decimal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/converters/Decimal.py b/converters/Decimal.py index da955f3e..abbb72a0 100644 --- a/converters/Decimal.py +++ b/converters/Decimal.py @@ -60,7 +60,7 @@ def __init__(self): self.suffix_regex = re.compile(f" *({'|'.join(self.suffixes)})") # Regular expression for xEy self.e_suffix_regex = re.compile(r" *E(-?\d+)") - + def convert(self, token: str) -> str: # 1 Filter out commas From 51829445356c62f9926c159a0d802d077d9e17d0 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:17:12 +0530 Subject: [PATCH 32/36] fix --- converters/Range.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/converters/Range.py b/converters/Range.py index 3111a830..9bbf8e02 100644 --- a/converters/Range.py +++ b/converters/Range.py @@ -12,10 +12,9 @@ class Range: Note: Punctuation always stays the same """ - def __init__(self, language='en'): + def __init__(self): super().__init__() self.cardinal = Cardinal() - self.language = language def convert(self, token: str) -> str: numbers = re.split('-', token) @@ -23,15 +22,9 @@ def convert(self, token: str) -> str: token = self.cardinal.convert(numbers[0]) elif len(numbers) == 2 : - if self.language == 'ml' : - token = self.cardinal.convert(numbers[0]) - token += ' മുതൽ ' - token += self.cardinal.convert(numbers[1]) - token += ' വരെ ' - else : - token = self.cardinal.convert(numbers[0]) - token += ' to ' - token += self.cardinal.convert(numbers[1]) + token = self.cardinal.convert(numbers[0]) + token += ' to ' + token += self.cardinal.convert(numbers[1]) else : token = '' From 29a5ff1790b5ee368455c830ab50523dd3dd9c3b Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:18:36 +0530 Subject: [PATCH 33/36] fix --- text_normalize.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/text_normalize.py b/text_normalize.py index 7b4e6e7d..0ca7e9bf 100644 --- a/text_normalize.py +++ b/text_normalize.py @@ -87,16 +87,9 @@ def normalize_split(text): return normalized_text.replace(" ' s", "'s") -def is_malayalam(word) : - ''' - returns match if string starts with malayalam unicode block - https://en.wikipedia.org/wiki/Malayalam_(Unicode_block) - ''' - return re.match('([\u0d00-\u0d7f]+)',word) - def remove_accents(input_str): nfkd_form = unicodedata.normalize('NFKD', input_str) - return u"".join([c for c in nfkd_form if not unicodedata.combining(c) or is_malayalam(c)]) + return u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) def has_numbers(inputString): return any(char.isdigit() for char in inputString) @@ -162,17 +155,10 @@ def normalize_single(text, prev_text = "", next_text = ""): return text.replace("$", "") -def normalize_text(text,language='en'): +def normalize_text(text): text = remove_accents(text).replace('–', ' to ').replace('-', ' - ').replace(":p", ": p").replace(":P", ": P").replace(":d", ": d").replace(":D", ": D") - # removing zero-width-no-joiner which is seen in malayalam text - # https://en.wikipedia.org/wiki/Zero-width_non-joiner - text = text.replace('\u200c','') words = word_tokenize(text) - # hack - changing the language attribute for the conversion pipeline - for label in labels : - labels[label].language = language - df = pd.DataFrame(words, columns=['before']) df['after'] = df['before'] @@ -185,6 +171,6 @@ def normalize_text(text,language='en'): if __name__ == '__main__' : text = 'hello (23 Jan 2020, 12:10 AM)' - out = normalize_text(text,language='ml') + out = normalize_text(text) print(out) From d76fb5da5e6f45d25f122358f58b86b92293169e Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:19:34 +0530 Subject: [PATCH 34/36] fix --- phonemize.py | 20 ++----------------- preprocess.ipynb | 50 ++++++++---------------------------------------- 2 files changed, 10 insertions(+), 60 deletions(-) diff --git a/phonemize.py b/phonemize.py index 79bc1d9a..f6885572 100644 --- a/phonemize.py +++ b/phonemize.py @@ -18,7 +18,6 @@ "doesn": "dˈʌzən", } - def phonemize(text, global_phonemizer, tokenizer): text = normalize_text(remove_accents(text)) words = tokenizer.tokenize(text) @@ -72,24 +71,9 @@ def phonemize(text, global_phonemizer, tokenizer): phonemes.append(word.replace('@', '')) input_ids.append(tokenizer.encode(word.replace('@', ''))[0]) continue - + input_ids.append(tokenizer.encode(word)[0]) phonemes.append(phoneme) assert len(input_ids) == len(phonemes) - return {'input_ids' : input_ids, 'phonemes': phonemes} - -if __name__ == '__main__' : - from transformers import TransfoXLTokenizer - tname = "transfo-xl-wt103" - tokenizer = TransfoXLTokenizer.from_pretrained(tname) # you can use any other tokenizers if you want to - - import phonemizer - global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True) - - text = 'hello my dear did you get the wrong @number 12 12.5' - text = 'hello from (1200 - 1230 - 1240)' - text = 'ദേശീയോദ്യാനങ്ങൾ സംരക്ഷിതപ്രദേശങ്ങളാണ്.' - dd = phonemize(text, global_phonemizer, tokenizer) - pass - + return {'input_ids' : input_ids, 'phonemes': phonemes} \ No newline at end of file diff --git a/preprocess.ipynb b/preprocess.ipynb index 3fc3f0a0..1017addf 100644 --- a/preprocess.ipynb +++ b/preprocess.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "c1d31f54", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "21eb8ed4", "metadata": {}, @@ -28,13 +26,7 @@ "import yaml\n", "\n", "config_path = \"Configs/config.yml\" # you can change it to anything else\n", - "# config_path = \"Configs/config_multi_en.yml\" # multilingual english\n", - "# config_path = \"Configs/config_multi_ml.yml\" # multilingual malayalam\n", - "config = yaml.safe_load(open(config_path))\n", - "\n", - "\n", - "language = config['dataset_params'].get('language','en')\n", - "mode = config['dataset_params'].get('mode','mono-en')" + "config = yaml.safe_load(open(config_path))" ] }, { @@ -47,14 +39,6 @@ "from phonemize import phonemize" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b83a77e4", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -63,11 +47,7 @@ "outputs": [], "source": [ "import phonemizer\n", - "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', \n", - " preserve_punctuation=True, \n", - " with_stress=True,\n", - " language_switch='remove-flags'\n", - " )" + "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)" ] }, { @@ -77,15 +57,11 @@ "metadata": {}, "outputs": [], "source": [ - "#from transformers import TransfoXLTokenizer\n", - "#tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to\n", - "\n", - "from transformers import BertTokenizer\n", - "tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['tokenizer'])\n" + "from transformers import TransfoXLTokenizer\n", + "tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2eb25417", "metadata": {}, @@ -103,8 +79,7 @@ "outputs": [], "source": [ "from datasets import load_dataset\n", - "#dataset = load_dataset(\"wikipedia\", \"20220301.en\")['train'] # you can use other version of this dataset\n", - "dataset = load_dataset(\"wikipedia\", language=\"ml\", date=\"20231101\",beam_runner='DirectRunner')['train']" + "dataset = load_dataset(\"wikipedia\", \"20220301.en\")['train'] # you can use other version of this dataset" ] }, { @@ -114,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "root_directory = \"./wiki_ml_phoneme\" # set up root directory for multiprocessor processing" + "root_directory = \"./wiki_phoneme\" # set up root directory for multiprocessor processing" ] }, { @@ -134,12 +109,7 @@ " return\n", " print('Processing shard %d ...' % i)\n", " shard = dataset.shard(num_shards=num_shards, index=i)\n", - " processed_dataset = shard.map(lambda t: phonemize(t['text'], \n", - " global_phonemizer, \n", - " tokenizer,\n", - " language=\"ml\"\n", - " ), \n", - " remove_columns=['text'])\n", + " processed_dataset = shard.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)\n", " processed_dataset.save_to_disk(directory)" @@ -157,7 +127,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c21f9dcf", "metadata": {}, @@ -174,14 +143,13 @@ }, "outputs": [], "source": [ - "max_workers = 3 # change this to the number of CPU cores your machine has \n", + "max_workers = 32 # change this to the number of CPU cores your machine has \n", "\n", "with ProcessPool(max_workers=max_workers) as pool:\n", " pool.map(process_shard, range(num_shards), timeout=60)" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b78caee6", "metadata": {}, @@ -234,7 +202,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "cdf6f6f6", "metadata": {}, @@ -343,7 +310,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1c9e968e", "metadata": {}, From a7325b316428e55ccbc1d03e719e05d8f60fc365 Mon Sep 17 00:00:00 2001 From: dsplog Date: Sun, 10 Dec 2023 11:20:13 +0530 Subject: [PATCH 35/36] fix --- preprocess.py | 187 -------------------------------------------------- 1 file changed, 187 deletions(-) delete mode 100644 preprocess.py diff --git a/preprocess.py b/preprocess.py deleted file mode 100644 index 290dc090..00000000 --- a/preprocess.py +++ /dev/null @@ -1,187 +0,0 @@ -# %% [markdown] -# # Notebook for preprocessing Wikipedia (English) dataset - -# %% [markdown] -# ### Initilizing phonemizer and tokenizer - -# %% -import yaml - -config_path = "Configs/config.yml" # you can change it to anything else -config = yaml.safe_load(open(config_path)) - -language = config['dataset_params'].get('language','en') -mode = config['dataset_params'].get('mode','mono-en') - -# %% -from phonemize import phonemize - -# %% -import phonemizer -global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', - preserve_punctuation=True, - with_stress=True, - language_switch='remove-flags' - ) - - -if mode == 'mono-en' : - from transformers import TransfoXLTokenizer - tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to -elif 'multi' in mode : - from transformers import BertTokenizer - tokenizer = BertTokenizer.from_pretrained(config['dataset_params']['tokenizer']) - -# %% [markdown] -# ### Process dataset - -# %% -from datasets import load_dataset - - -src, suffix = config['data_folder'].split('_') -if 'en' in suffix : - dataset = load_dataset(src, "20220301.en")['train'] # you can use other version of this dataset -elif 'ml' in suffix : - dataset = load_dataset(src, language="ml", date="20231101",beam_runner='DirectRunner')['train'] - - -# %% -root_directory = f"./wiki_{language}_phoneme" # set up root directory for multiprocessor processing - -# %% -import os -num_shards = 50000 - -def process_shard(i): - directory = root_directory + "/shard_" + str(i) - if os.path.exists(directory): - print("Shard %d already exists!" % i) - return - print('Processing shard %d ...' % i) - shard = dataset.shard(num_shards=num_shards, index=i) - processed_dataset = shard.map(lambda t: phonemize(t['text'], - global_phonemizer, - tokenizer, - language=language - ), - remove_columns=['text'], - ) - if not os.path.exists(directory): - os.makedirs(directory) - processed_dataset.save_to_disk(directory) - -# %% -from pebble import ProcessPool -from concurrent.futures import TimeoutError - -# %% [markdown] -# #### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed. -# - -# %% -max_workers = 3 # change this to the number of CPU cores your machine has - -with ProcessPool(max_workers=max_workers) as pool: - pool.map(process_shard, range(num_shards), timeout=300) - -# %% [markdown] -# ### Collect all shards to form the processed dataset - -# %% -from datasets import load_from_disk, concatenate_datasets - -output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))] -datasets = [] -for o in output: - directory = root_directory + "/" + o - try: - shard = load_from_disk(directory) - datasets.append(shard) - print("%s loaded" % o) - except: - continue - -# %% -dataset = concatenate_datasets(datasets) -dataset.save_to_disk(config['data_folder']) -print('Dataset saved to %s' % config['data_folder']) - -# %% -# check the dataset size -dataset - -# %% [markdown] -# ### Remove unneccessary tokens from the pre-trained tokenizer -# The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. - -# %% -from simple_loader import FilePathDataset, build_dataloader - -file_data = FilePathDataset(dataset) -loader = build_dataloader(file_data, num_workers=4, batch_size=128) - -# %% -special_token = config['dataset_params']['word_separator'] - -# %% -# get all unique tokens in the entire dataset - -from tqdm import tqdm - -unique_index = [special_token] -for _, batch in enumerate(tqdm(loader)): - unique_index.extend(batch) - unique_index = list(set(unique_index)) - -# %% -# get each token's lower case - -lower_tokens = [] -for t in tqdm(unique_index): - word = tokenizer.decode([t]) - if word.lower() != word: - t = tokenizer.encode([word.lower()])[0] - lower_tokens.append(t) - else: - lower_tokens.append(t) - -# %% -lower_tokens = (list(set(lower_tokens))) - -# %% -# redo the mapping for lower number of tokens - -token_maps = {} -for t in tqdm(unique_index): - word = tokenizer.decode([t]) - word = word.lower() - new_t = tokenizer.encode([word.lower()])[0] - token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)} - -# %% -import pickle -with open(config['dataset_params']['token_maps'], 'wb') as handle: - pickle.dump(token_maps, handle) -print('Token mapper saved to %s' % config['dataset_params']['token_maps']) - -# %% [markdown] -# ### Test the dataset with dataloader -# - -# %% -from dataloader import build_dataloader, FilePathDataset - -tmp = FilePathDataset(dataset, **config['dataset_params']) - -for k in range(len(tmp)) : - data = tmp[k] - -train_loader = build_dataloader(dataset, batch_size=32, num_workers=4, dataset_config=config['dataset_params']) - -# %% -_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader)) - -pass - - From 85bd63859072235df7d18faef9bc9f1ba01bf504 Mon Sep 17 00:00:00 2001 From: "Aaron (Yinghao) Li" <71044569+yl4579@users.noreply.github.com> Date: Mon, 8 Jan 2024 00:21:37 -0500 Subject: [PATCH 36/36] Update train.ipynb Fix #40 --- train.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.ipynb b/train.ipynb index 9cc2f4b4..a7becc3d 100644 --- a/train.ipynb +++ b/train.ipynb @@ -117,7 +117,7 @@ " \n", " bert = AlbertModel(albert_base_configuration)\n", " bert = MultiTaskModel(bert, \n", - " num_vocab=max([m['token'] for m in token_maps.values()]), \n", + " num_vocab=1 + max([m['token'] for m in token_maps.values()]), \n", " num_tokens=config['model_params']['vocab_size'],\n", " hidden_size=config['model_params']['hidden_size'])\n", " \n",