From 29bfdc91dfb999139bd6e153fbb2f57d5cb9c946 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 09:38:24 +0200 Subject: [PATCH 01/62] Add script for downloading GLUE data (source: https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3) --- TinyBERT/scripts/download_glue_data.py | 154 +++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 TinyBERT/scripts/download_glue_data.py diff --git a/TinyBERT/scripts/download_glue_data.py b/TinyBERT/scripts/download_glue_data.py new file mode 100644 index 00000000..17c1a1f8 --- /dev/null +++ b/TinyBERT/scripts/download_glue_data.py @@ -0,0 +1,154 @@ +''' Script for downloading all GLUE data. + +Note: for legal reasons, we are unable to host MRPC. +You can either use the version hosted by the SentEval team, which is already tokenized, +or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. +For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). +You should then rename and place specific files in a folder (see below for an example). + +mkdir MRPC +cabextract MSRParaphraseCorpus.msi -d MRPC +cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt +cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt +rm MRPC/_* +rm MSRParaphraseCorpus.msi + +1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. +2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! 
+''' + +import argparse +import io +import os +import sys +import shutil +import tempfile +import urllib.request +import zipfile + +URLLIB = urllib.request + +TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"] +TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip', + "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip', + "QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip', + "STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip', + "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip', + "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip', + "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip', + "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip', + "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv', + 'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv'} + +MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' +MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' + +def download_and_extract(task, data_dir): + print("Downloading and extracting %s..." % task) + if task == "MNLI": + print("\tNote (12/10/20): This script no longer downloads SNLI. 
You will need to manually download and format the data to use SNLI.") + data_file = "%s.zip" % task + urllib.request.urlretrieve(TASK2PATH[task], data_file) + with zipfile.ZipFile(data_file) as zip_ref: + zip_ref.extractall(data_dir) + os.remove(data_file) + print("\tCompleted!") + +def format_mrpc(data_dir, path_to_data): + print("Processing MRPC...") + mrpc_dir = os.path.join(data_dir, "MRPC") + if not os.path.isdir(mrpc_dir): + os.mkdir(mrpc_dir) + if path_to_data: + mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt") + else: + try: + mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") + URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) + URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file) + except urllib.error.HTTPError: + print("Error downloading MRPC") + return + assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file + assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file + + with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh: + header = data_fh.readline() + test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") + for idx, row in enumerate(data_fh): + label, id1, id2, s1, s2 = row.strip().split('\t') + test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) + + try: + URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv")) + except KeyError or urllib.error.HTTPError: + print("\tError downloading standard development IDs for MRPC. 
You will need to manually split your data.") + return + + dev_ids = [] + with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split('\t')) + + with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \ + io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split('\t') + if [id1, id2] in dev_ids: + dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + else: + train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + + print("\tCompleted!") + +def download_diagnostic(data_dir): + print("Downloading and extracting diagnostic...") + if not os.path.isdir(os.path.join(data_dir, "diagnostic")): + os.mkdir(os.path.join(data_dir, "diagnostic")) + data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") + urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) + print("\tCompleted!") + return + +def get_tasks(task_names): + task_names = task_names.split(',') + if "all" in task_names: + tasks = TASKS + else: + tasks = [] + for task_name in task_names: + assert task_name in TASKS, "Task %s not found!" 
% task_name + tasks.append(task_name) + return tasks + +def main(arguments): + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') + parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', + type=str, default='all') + parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', + type=str, default='') + args = parser.parse_args(arguments) + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + tasks = get_tasks(args.tasks) + + for task in tasks: + if task == 'MRPC': + format_mrpc(args.data_dir, args.path_to_mrpc) + elif task == 'diagnostic': + download_diagnostic(args.data_dir) + else: + download_and_extract(task, args.data_dir) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) \ No newline at end of file From c2dceb7802376fbbcdc46a156b433fbb8e223d66 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 09:39:04 +0200 Subject: [PATCH 02/62] Add blank data folder --- TinyBERT/data/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 TinyBERT/data/.gitkeep diff --git a/TinyBERT/data/.gitkeep b/TinyBERT/data/.gitkeep new file mode 100644 index 00000000..e69de29b From b05fee5d1451ac1100d739a737a06bc14172aa2a Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 11:15:08 +0200 Subject: [PATCH 03/62] Comment logging candidate words --- TinyBERT/data_augmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py index b817b865..77dd989f 100644 --- a/TinyBERT/data_augmentation.py +++ b/TinyBERT/data_augmentation.py @@ -208,7 +208,7 @@ def augment(self, sent): for (idx, word) in enumerate(tokens): if _is_valid(word) and word not in StopWordsList: candidate_words[idx] = self._word_augment(sent, idx, word) - 
logger.info(candidate_words) + # logger.info(candidate_words) cnt = 0 while cnt < self.N: new_sent = list(tokens) From 45a26dad2e9a8dd6097ff7272dff0a91f27ce48d Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 11:25:54 +0200 Subject: [PATCH 04/62] Log info about augmentation status more frequently --- TinyBERT/data_augmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py index 77dd989f..c9d6224f 100644 --- a/TinyBERT/data_augmentation.py +++ b/TinyBERT/data_augmentation.py @@ -261,7 +261,7 @@ def read_augment_write(self): line[augment_id] = augment_sent writer.writerow(line) - if (i+1) % 1000 == 0: + if (i+1) % 50 == 0: logger.info("Having been processing {} examples".format(str(i+1))) From 780cdc30e9e7ff7ef4f3ebc3e86c33d0778ec887 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 13:43:56 +0200 Subject: [PATCH 05/62] Log info about loading GloVe embeddings --- TinyBERT/data_augmentation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py index c9d6224f..ee043444 100644 --- a/TinyBERT/data_augmentation.py +++ b/TinyBERT/data_augmentation.py @@ -89,6 +89,7 @@ def _read_tsv(input_file, quotechar=None): def prepare_embedding_retrieval(glove_file, vocab_size=100000): + logger.info('Preparing GloVe embedding started') cnt = 0 words = [] embeddings = {} @@ -117,6 +118,7 @@ def prepare_embedding_retrieval(glove_file, vocab_size=100000): # normalize each word vector d = (np.sum(emb_matrix ** 2, 1) ** 0.5) emb_norm = (emb_matrix.T / d).T + logger.info('Preparing GloVe embedding finished') return emb_norm, vocab, ids_to_tokens From bc9394d45cc1bae1f8b2c360a4d2823cab7de20f Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 14 Sep 2021 13:44:25 +0200 Subject: [PATCH 06/62] Add script for standard fine-tuning of pretrained models --- TinyBERT/data_processing.py | 573 
++++++++++++++++++++++++++++++++++++ TinyBERT/fine_tune_bert.py | 465 +++++++++++++++++++++++++++++ TinyBERT/task_distill.py | 569 +---------------------------------- 3 files changed, 1041 insertions(+), 566 deletions(-) create mode 100644 TinyBERT/data_processing.py create mode 100644 TinyBERT/fine_tune_bert.py diff --git a/TinyBERT/data_processing.py b/TinyBERT/data_processing.py new file mode 100644 index 00000000..497dc65c --- /dev/null +++ b/TinyBERT/data_processing.py @@ -0,0 +1,573 @@ +import csv +import os +import sys + +import torch +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import f1_score, matthews_corrcoef +from torch.utils.data import TensorDataset + +from task_distill import logger + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and 
dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), + "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return 
self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, 
data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_aug_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + seq_length = len(input_ids) + + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if output_mode == "classification": + label_id = label_map[example.label] + elif 
output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: {}".format(example.label)) + logger.info("label_id: {}".format(label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + seq_length=seq_length)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + 
return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor +} + +output_modes = { + "cola": "classification", + "mnli": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification" +} diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py new file mode 100644 index 00000000..b129c982 --- /dev/null +++ b/TinyBERT/fine_tune_bert.py @@ -0,0 +1,465 @@ +# coding=utf-8 +# 2019.12.2-Changed 
for TinyBERT task-specific distillation +# Huawei Technologies Co., Ltd. +# Copyright 2020 Huawei Technologies Co., Ltd. +# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import csv +import logging +import os +import random +import sys + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) +from tqdm import tqdm, trange + +from torch.nn import CrossEntropyLoss, MSELoss + +from data_processing import convert_examples_to_features, \ + compute_metrics, get_tensor_data, processors, output_modes +from transformer.modeling import TinyBertForSequenceClassification +from transformer.tokenization import BertTokenizer +from transformer.optimization import BertAdam +from transformer.file_utils import WEIGHTS_NAME, CONFIG_NAME + +csv.field_size_limit(sys.maxsize) + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + +oncloud = True +try: + import moxing as mox +except: + oncloud = False + + +def 
result_to_file(result, file_name): + with open(file_name, "a") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for batch_ in tqdm(eval_dataloader, desc="Evaluating"): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append( + preds[0], logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(task_name, preds, eval_labels.numpy()) + result['eval_loss'] = eval_loss + + return result + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--pretrained_model", + default=None, + type=str, + help="The pretrained model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument('--weight_decay', '--wd', + default=1e-4, + type=float, + metavar='W', + help='weight decay') + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + # added arguments + parser.add_argument('--aug_train', + action='store_true') + parser.add_argument('--eval_step', + type=int, + default=50) + parser.add_argument('--data_url', + type=str, + default="") + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # intermediate distillation default parameters + default_params = { + "cola": {"num_train_epochs": 50, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 5, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 20, "max_seq_length": 128}, + "sst-2": {"num_train_epochs": 10, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 20, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 5, "max_seq_length": 128}, + "qnli": {"num_train_epochs": 10, "max_seq_length": 128}, + "rte": {"num_train_epochs": 20, "max_seq_length": 128} + } + + acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] + corr_tasks = ["sts-b"] + mcc_tasks = ["cola"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + # Prepare task settings + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory 
({}) already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + task_name = args.task_name.lower() + + if task_name in default_params: + args.max_seq_len = default_params[task_name]["max_seq_length"] + + if not args.do_eval: + if task_name in default_params: + args.num_train_epoch = default_params[task_name]["num_train_epochs"] + + if task_name not in processors: + raise ValueError("Task not found: %s" % task_name) + + processor = processors[task_name]() + output_mode = output_modes[task_name] + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case) + + if not args.do_eval: + if not args.aug_train: + train_examples = processor.get_train_examples(args.data_dir) + else: + train_examples = processor.get_aug_examples(args.data_dir) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + num_train_optimization_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + + train_features = convert_examples_to_features(train_examples, label_list, + args.max_seq_length, tokenizer, output_mode) + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = 
DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model = TinyBertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) + model.to(device) + + if args.do_eval: + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + else: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + if n_gpu > 1: + model = torch.nn.DataParallel(model) + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + size = 0 + for n, p in model.named_parameters(): + logger.info('n: {}'.format(n)) + size += p.nelement() + + logger.info('Total parameters: {}'.format(size)) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + if not args.pred_distill: + schedule = 'none' + optimizer = BertAdam(optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + # Train and evaluate + global_step = 0 + best_dev_acc = 0.0 + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0. + tr_cls_loss = 0. 
+ + model.train() + nb_tr_examples, nb_tr_steps = 0, 0 + + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)): + batch = tuple(t.to(device) for t in batch) + + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch + if input_ids.size()[0] != args.train_batch_size: + continue + + cls_loss = 0. + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_mse = MSELoss() + cls_loss = loss_mse(logits.view(-1), label_ids.view(-1)) + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + loss.backward() + + tr_loss += loss.item() + nb_tr_examples += label_ids.size(0) + nb_tr_steps += 1 + + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + if (global_step + 1) % args.eval_step == 0: + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + + loss = tr_loss / (step + 1) + cls_loss = tr_cls_loss / (step + 1) + + result = {} + if args.pred_distill: + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + + result_to_file(result, output_eval_file) + + if not args.pred_distill: + save_model = True + else: + save_model = False + + if task_name in acc_tasks and result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True + + if task_name in corr_tasks and result['corr'] > 
best_dev_acc: + best_dev_acc = result['corr'] + save_model = True + + if task_name in mcc_tasks and result['mcc'] > best_dev_acc: + best_dev_acc = result['mcc'] + save_model = True + + if save_model: + logger.info("***** Save model *****") + + model_to_save = model.module if hasattr(model, 'module') else model + + model_name = WEIGHTS_NAME + # if not args.pred_distill: + # model_name = "step_{}_{}".format(global_step, WEIGHTS_NAME) + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + # Test mnli-mm + if args.pred_distill and task_name == "mnli": + task_name = "mnli-mm" + processor = processors[task_name]() + if not os.path.exists(args.output_dir + '-MM'): + os.makedirs(args.output_dir + '-MM') + + eval_examples = processor.get_dev_examples(args.data_dir) + + eval_features = convert_examples_to_features( + eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + + logger.info("***** Running mm evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, + batch_size=args.eval_batch_size) + + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + + result['global_step'] = global_step + + tmp_output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") + result_to_file(result, tmp_output_eval_file) + + task_name = 'mnli' + + if oncloud: + logging.info(mox.file.list_directory(args.output_dir, recursive=True)) + logging.info(mox.file.list_directory('.', recursive=True)) + 
mox.file.copy_parallel(args.output_dir, args.data_url) + mox.file.copy_parallel('.', args.data_url) + + model.train() + + +if __name__ == "__main__": + main() diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index 16905a31..39628a38 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -29,14 +29,13 @@ import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) from tqdm import tqdm, trange from torch.nn import CrossEntropyLoss, MSELoss -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score +from data_processing import convert_examples_to_features, \ + compute_metrics, get_tensor_data, processors, output_modes from transformer.modeling import TinyBertForSequenceClassification from transformer.tokenization import BertTokenizer from transformer.optimization import BertAdam @@ -59,543 +58,6 @@ oncloud = False -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.seq_length = seq_length - self.label_id = label_id - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and 
dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return 
self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - 
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, 
data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_aug_examples(self, data_dir): - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - 
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode): - """Loads a data file into a list of `InputBatch`s.""" - - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] - - tokens = ["[CLS]"] + tokens_a + ["[SEP]"] - segment_ids = [0] * len(tokens) - - if tokens_b: - tokens += tokens_b + ["[SEP]"] - segment_ids += [1] * (len(tokens_b) + 1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - seq_length = len(input_ids) - - padding = [0] * (max_seq_length - len(input_ids)) - input_ids += padding - input_mask += padding - segment_ids += padding - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[example.label] - elif 
output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 1: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join( - [str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - logger.info("label: {}".format(example.label)) - logger.info("label_id: {}".format(label_id)) - - features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - seq_length=seq_length)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def simple_accuracy(preds, labels): - return (preds == labels).mean() - - -def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) - return { - "acc": acc, - "f1": f1, - "acc_and_f1": (acc + f1) / 2, - } - - -def pearson_and_spearman(preds, labels): - pearson_corr = pearsonr(preds, labels)[0] - spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } - - -def compute_metrics(task_name, preds, labels): - assert len(preds) == len(labels) - if task_name == "cola": - return {"mcc": matthews_corrcoef(labels, preds)} - elif task_name == "sst-2": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mrpc": - return acc_and_f1(preds, labels) - elif task_name == "sts-b": - return pearson_and_spearman(preds, labels) - elif task_name == "qqp": - 
return acc_and_f1(preds, labels) - elif task_name == "mnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "mnli-mm": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "qnli": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "rte": - return {"acc": simple_accuracy(preds, labels)} - elif task_name == "wnli": - return {"acc": simple_accuracy(preds, labels)} - else: - raise KeyError(task_name) - - -def get_tensor_data(output_mode, features): - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) - - all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label_ids, all_seq_lengths) - return tensor_data, all_label_ids - - def result_to_file(result, file_name): with open(file_name, "a") as writer: logger.info("***** Eval results *****") @@ -744,31 +206,6 @@ def main(): args = parser.parse_args() logger.info('The args: {}'.format(args)) - processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor - } - - output_modes = { - "cola": "classification", - "mnli": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification" 
- } - # intermediate distillation default parameters default_params = { "cola": {"num_train_epochs": 50, "max_seq_length": 64}, From 906c120f51ec0d1a47617db6cc607029148bfd42 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 09:16:59 +0200 Subject: [PATCH 07/62] Correct fine-tuning --- TinyBERT/fine_tune_bert.py | 57 ++++++++++++++++++-------------------- TinyBERT/task_distill.py | 4 +++ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index b129c982..4021bca9 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -194,14 +194,14 @@ def main(): # intermediate distillation default parameters default_params = { - "cola": {"num_train_epochs": 50, "max_seq_length": 64}, - "mnli": {"num_train_epochs": 5, "max_seq_length": 128}, - "mrpc": {"num_train_epochs": 20, "max_seq_length": 128}, - "sst-2": {"num_train_epochs": 10, "max_seq_length": 64}, - "sts-b": {"num_train_epochs": 20, "max_seq_length": 128}, - "qqp": {"num_train_epochs": 5, "max_seq_length": 128}, - "qnli": {"num_train_epochs": 10, "max_seq_length": 128}, - "rte": {"num_train_epochs": 20, "max_seq_length": 128} + "cola": {"num_train_epochs": 3, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, + "sst-2": {"num_train_epochs": 3, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, + "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "rte": {"num_train_epochs": 3, "max_seq_length": 128} } acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] @@ -311,8 +311,7 @@ def main(): {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] schedule = 'warmup_linear' - if not args.pred_distill: - schedule = 'none' + optimizer = BertAdam(optimizer_grouped_parameters, schedule=schedule, 
lr=args.learning_rate, @@ -379,32 +378,31 @@ def main(): loss = tr_loss / (step + 1) cls_loss = tr_cls_loss / (step + 1) - result = {} - if args.pred_distill: - result = do_eval(model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) result['global_step'] = global_step result['cls_loss'] = cls_loss result['loss'] = loss + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + result_to_file(result, output_eval_file) - if not args.pred_distill: - save_model = True - else: - save_model = False + save_model = False - if task_name in acc_tasks and result['acc'] > best_dev_acc: - best_dev_acc = result['acc'] - save_model = True + if task_name in acc_tasks and result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True - if task_name in corr_tasks and result['corr'] > best_dev_acc: - best_dev_acc = result['corr'] - save_model = True + if task_name in corr_tasks and result['corr'] > best_dev_acc: + best_dev_acc = result['corr'] + save_model = True - if task_name in mcc_tasks and result['mcc'] > best_dev_acc: - best_dev_acc = result['mcc'] - save_model = True + if task_name in mcc_tasks and result['mcc'] > best_dev_acc: + best_dev_acc = result['mcc'] + save_model = True if save_model: logger.info("***** Save model *****") @@ -412,8 +410,7 @@ def main(): model_to_save = model.module if hasattr(model, 'module') else model model_name = WEIGHTS_NAME - # if not args.pred_distill: - # model_name = "step_{}_{}".format(global_step, WEIGHTS_NAME) + output_model_file = os.path.join(args.output_dir, model_name) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) @@ -422,7 +419,7 @@ def main(): tokenizer.save_vocabulary(args.output_dir) # Test mnli-mm - if args.pred_distill and task_name == "mnli": + if task_name == "mnli": task_name = "mnli-mm" processor 
= processors[task_name]() if not os.path.exists(args.output_dir + '-MM'): diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index 39628a38..b130d80a 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -451,6 +451,10 @@ def soft_cross_entropy(predicts, targets): result['rep_loss'] = rep_loss result['loss'] = loss + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + result_to_file(result, output_eval_file) if not args.pred_distill: From 3dd28f91a8a0af39cd62db87dcdcb03c02200b16 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 09:41:08 +0200 Subject: [PATCH 08/62] Correct logging --- TinyBERT/data_processing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TinyBERT/data_processing.py b/TinyBERT/data_processing.py index 497dc65c..5ab7c242 100644 --- a/TinyBERT/data_processing.py +++ b/TinyBERT/data_processing.py @@ -1,4 +1,5 @@ import csv +import logging import os import sys @@ -7,7 +8,10 @@ from sklearn.metrics import f1_score, matthews_corrcoef from torch.utils.data import TensorDataset -from task_distill import logger +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) class InputExample(object): From 40350397b2e5ace351077c70619812967894d5b1 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 09:42:23 +0200 Subject: [PATCH 09/62] Add minor correct --- TinyBERT/fine_tune_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index 4021bca9..837a60bf 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -248,7 +248,7 @@ def main(): label_list = processor.get_labels() num_labels = len(label_list) - tokenizer = BertTokenizer.from_pretrained(args.student_model, 
do_lower_case=args.do_lower_case) + tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) if not args.do_eval: if not args.aug_train: From c605dbf53d3d82dbdcafa281a967ae7ee073f307 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 09:58:27 +0200 Subject: [PATCH 10/62] Add minor corrects --- TinyBERT/fine_tune_bert.py | 6 +----- TinyBERT/task_distill.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index 837a60bf..10adafa6 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -367,7 +367,7 @@ def main(): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0: + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) logger.info(" Num examples = %d", len(eval_examples)) @@ -384,10 +384,6 @@ def main(): result['cls_loss'] = cls_loss result['loss'] = loss - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - result_to_file(result, output_eval_file) save_model = False diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index b130d80a..39628a38 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -451,10 +451,6 @@ def soft_cross_entropy(predicts, targets): result['rep_loss'] = rep_loss result['loss'] = loss - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - result_to_file(result, output_eval_file) if not args.pred_distill: From efd7a7d01104154e2385107fecb801129f40dfd4 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 10:00:33 +0200 Subject: [PATCH 11/62] Add minor corrects --- TinyBERT/fine_tune_bert.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index 10adafa6..b511e429 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -367,7 +367,7 @@ def main(): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == num_train_optimization_steps: + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) in [1, num_train_optimization_steps]: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) logger.info(" Num examples = %d", len(eval_examples)) From 065a08210986b3f518cd57cb789a7d481561c37f Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 10:08:34 +0200 Subject: [PATCH 12/62] Add minor corrects --- TinyBERT/fine_tune_bert.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index b511e429..9dd52d92 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -194,14 +194,14 @@ def main(): # intermediate distillation default parameters default_params = { - "cola": {"num_train_epochs": 3, "max_seq_length": 64}, - "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, - "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, - "sst-2": {"num_train_epochs": 3, "max_seq_length": 64}, - "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, - "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, - "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, - "rte": {"num_train_epochs": 3, "max_seq_length": 128} + "cola": {"num_train_epochs": 5, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 5, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 5, "max_seq_length": 128}, + "sst-2": {"num_train_epochs": 5, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 5, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 5, "max_seq_length": 128}, + "qnli": 
{"num_train_epochs": 5, "max_seq_length": 128}, + "rte": {"num_train_epochs": 5, "max_seq_length": 128} } acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] @@ -367,7 +367,8 @@ def main(): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) in [1, num_train_optimization_steps]: + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 1 or \ + (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) logger.info(" Num examples = %d", len(eval_examples)) From e68c110649ab4283ce80a37bb4a9bb8a66e39526 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 10:11:51 +0200 Subject: [PATCH 13/62] Correct saving results --- TinyBERT/fine_tune_bert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index 9dd52d92..f90fd5e3 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -60,6 +60,7 @@ def result_to_file(result, file_name): with open(file_name, "a") as writer: + writer.write("") logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) From 2c69c978b7e16c67110498d114f830da2648eca7 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 10:28:31 +0200 Subject: [PATCH 14/62] Add small correction --- TinyBERT/fine_tune_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index f90fd5e3..4b7a9881 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -368,7 +368,7 @@ def main(): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 1 or \ + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \ (global_step + 1) == num_train_optimization_steps: logger.info("***** 
Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) From b12e2e053a2c63611721ae0ecf3c42b16a383199 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 10:44:55 +0200 Subject: [PATCH 15/62] Add small correction for task_distill.py --- TinyBERT/task_distill.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index 39628a38..5c2f3fa1 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -61,6 +61,7 @@ def result_to_file(result, file_name): with open(file_name, "a") as writer: logger.info("***** Eval results *****") + writer.write("") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) @@ -428,7 +429,8 @@ def soft_cross_entropy(predicts, targets): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0: + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \ + (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) logger.info(" Num examples = %d", len(eval_examples)) From 008b180bfb25c73554ecd89563a8be829e5f4fa0 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 15 Sep 2021 12:14:21 +0200 Subject: [PATCH 16/62] Add minor corrects --- TinyBERT/fine_tune_bert.py | 16 ++++++++-------- TinyBERT/task_distill.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index 4b7a9881..e6707fcb 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -195,13 +195,13 @@ def main(): # intermediate distillation default parameters default_params = { - "cola": {"num_train_epochs": 5, "max_seq_length": 64}, - "mnli": {"num_train_epochs": 5, "max_seq_length": 128}, - "mrpc": {"num_train_epochs": 5, "max_seq_length": 
128}, - "sst-2": {"num_train_epochs": 5, "max_seq_length": 64}, - "sts-b": {"num_train_epochs": 5, "max_seq_length": 128}, - "qqp": {"num_train_epochs": 5, "max_seq_length": 128}, - "qnli": {"num_train_epochs": 5, "max_seq_length": 128}, + "cola": {"num_train_epochs": 3, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, + "sst-2": {"num_train_epochs":3, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, + "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, "rte": {"num_train_epochs": 5, "max_seq_length": 128} } @@ -368,7 +368,7 @@ def main(): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \ + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py index 5c2f3fa1..2e6b9b6b 100644 --- a/TinyBERT/task_distill.py +++ b/TinyBERT/task_distill.py @@ -429,7 +429,7 @@ def soft_cross_entropy(predicts, targets): optimizer.zero_grad() global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \ + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ (global_step + 1) == num_train_optimization_steps: logger.info("***** Running evaluation *****") logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) From 0996bde9961ce793736fe4bb18483ead77ae010f Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 21 Sep 2021 11:52:00 +0200 Subject: [PATCH 17/62] feat: add script counting number of parameters and MACs of model --- TinyBERT/model_statistics.py | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 
100644 TinyBERT/model_statistics.py diff --git a/TinyBERT/model_statistics.py b/TinyBERT/model_statistics.py new file mode 100644 index 00000000..75f3884f --- /dev/null +++ b/TinyBERT/model_statistics.py @@ -0,0 +1,90 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import math +import sys + +import torch +from thop import profile + +from data_processing import processors +from transformer.modeling import TinyBertForSequenceClassification +from transformer.tokenization import BertTokenizer + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + + +def print_results(macs, params, title=''): + if len(title) != 0: + print("- " + title) + print(f"\tmacs [G]: {macs / math.pow(10, 9):.2f}, params [M]: {params / math.pow(10, 6):.2f}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", + default=None, + type=str, + help="The anlised model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + task_name = args.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % task_name) + + processor = processors[task_name]() + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) + + model = TinyBertForSequenceClassification.from_pretrained(args.model, num_labels=num_labels) + model.to(device) + + model_input = tuple([torch.randint(high=len(tokenizer.vocab), + size=(1, args.max_seq_length), dtype=torch.int64, device=device), + torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device), + torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device)]) + + macs, params = profile(model, inputs=model_input) + + print("Results") + print_results(macs, params) + + +if __name__ == "__main__": + main() From b6c971664fd66e30eeca05772b62c7bc0a4c28e4 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 21 Sep 2021 11:59:38 +0200 Subject: [PATCH 18/62] feat: update requirements.txt --- TinyBERT/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TinyBERT/requirements.txt b/TinyBERT/requirements.txt index 5f5389e8..2a155cdd 100644 --- a/TinyBERT/requirements.txt +++ b/TinyBERT/requirements.txt @@ 
-7,4 +7,6 @@ requests torch>=1.0.1 scipy>=0.14.0 -seaborn \ No newline at end of file +seaborn + +thop From 4b93a392a8e4b01340a7122c61116b45bdcb6cbd Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 21 Sep 2021 12:01:35 +0200 Subject: [PATCH 19/62] fix: add no_cuda argument --- TinyBERT/model_statistics.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/TinyBERT/model_statistics.py b/TinyBERT/model_statistics.py index 75f3884f..a26b6252 100644 --- a/TinyBERT/model_statistics.py +++ b/TinyBERT/model_statistics.py @@ -47,6 +47,9 @@ def main(): parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") args = parser.parse_args() logger.info('The args: {}'.format(args)) From 222d8f1a89647792a97e1f4bd408a38e303bc5cf Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 14:06:24 +0100 Subject: [PATCH 20/62] feat: small correct in logging during finetuning --- TinyBERT/fine_tune_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py index e6707fcb..4a848775 100644 --- a/TinyBERT/fine_tune_bert.py +++ b/TinyBERT/fine_tune_bert.py @@ -63,7 +63,7 @@ def result_to_file(result, file_name): writer.write("") logger.info("***** Eval results *****") for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) + logger.info("%s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) From 7f3a20d420ff45be3b977fb5988a08600d519fb0 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 16:25:56 +0100 Subject: [PATCH 21/62] feat: add scripts for multiemo --- TernaryBERT/download_bert_base.py | 31 ++ TernaryBERT/fine_tune_bert.py | 427 ++++++++++++++++++++++++ TernaryBERT/quant_task_glue.py | 167 ++++----- TernaryBERT/quant_task_polemo.py | 422 +++++++++++++++++++++++ 
TernaryBERT/scripts/download_dataset.py | 54 +++ TernaryBERT/utils_glue.py | 28 +- TernaryBERT/utils_multiemo.py | 226 +++++++++++++ 7 files changed, 1262 insertions(+), 93 deletions(-) create mode 100644 TernaryBERT/download_bert_base.py create mode 100644 TernaryBERT/fine_tune_bert.py create mode 100644 TernaryBERT/quant_task_polemo.py create mode 100644 TernaryBERT/scripts/download_dataset.py create mode 100644 TernaryBERT/utils_multiemo.py diff --git a/TernaryBERT/download_bert_base.py b/TernaryBERT/download_bert_base.py new file mode 100644 index 00000000..fa99e41a --- /dev/null +++ b/TernaryBERT/download_bert_base.py @@ -0,0 +1,31 @@ +import os +import requests +import tarfile + +url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz' + +output_path = os.path.join('data', 'models') +os.makedirs(output_path, exist_ok=True) + +output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz') +model_folder = os.path.join(output_path, 'bert-base-uncased') + +response = requests.get(url, stream=True) +if response.status_code == 200: + with open(output_tar, 'wb') as f: + f.write(response.raw.read()) + +with tarfile.open(name=output_tar, mode="r|gz") as tar_ref: + tar_ref.extractall(model_folder) + +os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json')) + +os.remove(output_tar) + +url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt' +r = requests.get(url_vocab) + +with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f: + f.write(r.content) + +print('Completed!') diff --git a/TernaryBERT/fine_tune_bert.py b/TernaryBERT/fine_tune_bert.py new file mode 100644 index 00000000..e2c1d1ee --- /dev/null +++ b/TernaryBERT/fine_tune_bert.py @@ -0,0 +1,427 @@ +# coding=utf-8 +# 2019.12.2-Changed for TinyBERT task-specific distillation +# Huawei Technologies Co., Ltd. +# Copyright 2020 Huawei Technologies Co., Ltd. 
+# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import csv +import logging +import os +import random +import sys + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) +from tqdm import tqdm, trange + +from torch.nn import CrossEntropyLoss, MSELoss + +from utils_multiemo import * +from transformer.modeling import BertForSequenceClassification +from transformer.tokenization import BertTokenizer +from transformer.optimization import BertAdam +from transformer.file_utils import WEIGHTS_NAME, CONFIG_NAME + +csv.field_size_limit(sys.maxsize) + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + + +def result_to_file(result, file_name): + with open(file_name, "a") as writer: + writer.write("") + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info("%s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +def 
get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for batch_ in tqdm(eval_dataloader, desc="Evaluating"): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append( + preds[0], logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(task_name, preds, eval_labels.numpy()) + 
result['eval_loss'] = eval_loss + + return result + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--pretrained_model", + default=None, + type=str, + help="The pretrained model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument('--weight_decay', '--wd', + default=1e-4, + type=float, + metavar='W', + help='weight decay') + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + # added arguments + parser.add_argument('--aug_train', + action='store_true') + parser.add_argument('--eval_step', + type=int, + default=50) + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # intermediate distillation default parameters + default_params = { + "multiemo": {"num_train_epochs": 3, "max_seq_length": 128}, + "cola": {"num_train_epochs": 3, "max_seq_length": 64}, + "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, + "sst-2": {"num_train_epochs": 3, "max_seq_length": 64}, + "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, + "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, + "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, + "rte": {"num_train_epochs": 5, "max_seq_length": 128} + } + + acc_tasks = ["multiemo", "mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] + corr_tasks = ["sts-b"] + mcc_tasks = ["cola"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + # Prepare task settings + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output 
directory ({}) already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + task_name = args.task_name.lower() + + if task_name in default_params: + args.max_seq_len = default_params[task_name]["max_seq_length"] + + if not args.do_eval: + if task_name in default_params: + args.num_train_epoch = default_params[task_name]["num_train_epochs"] + + if task_name not in processors: + raise ValueError("Task not found: %s" % task_name) + + processor = processors[task_name]() + output_mode = output_modes[task_name] + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) + + if not args.do_eval: + if not args.aug_train: + train_examples = processor.get_train_examples(args.data_dir) + else: + train_examples = processor.get_aug_examples(args.data_dir) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + num_train_optimization_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + + train_features = convert_examples_to_features(train_examples, label_list, + args.max_seq_length, tokenizer, output_mode) + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = 
DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) + model.to(device) + + if args.do_eval: + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + else: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + if n_gpu > 1: + model = torch.nn.DataParallel(model) + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + size = 0 + for n, p in model.named_parameters(): + logger.info('n: {}'.format(n)) + size += p.nelement() + + logger.info('Total parameters: {}'.format(size)) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + + optimizer = BertAdam(optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + # Train and evaluate + global_step = 0 + best_dev_acc = 0.0 + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0. + tr_cls_loss = 0. 
+ + model.train() + nb_tr_examples, nb_tr_steps = 0, 0 + + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)): + batch = tuple(t.to(device) for t in batch) + + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch + if input_ids.size()[0] != args.train_batch_size: + continue + + cls_loss = 0. + + logits, _, _ = model(input_ids, segment_ids, input_mask) + + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_mse = MSELoss() + cls_loss = loss_mse(logits.view(-1), label_ids.view(-1)) + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + loss.backward() + + tr_loss += loss.item() + nb_tr_examples += label_ids.size(0) + nb_tr_steps += 1 + + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ + (global_step + 1) == num_train_optimization_steps: + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + + loss = tr_loss / (step + 1) + cls_loss = tr_cls_loss / (step + 1) + + result = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + + result_to_file(result, output_eval_file) + + save_model = False + + if task_name in acc_tasks and result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True + + if task_name in corr_tasks and result['corr'] > best_dev_acc: 
+ best_dev_acc = result['corr'] + save_model = True + + if task_name in mcc_tasks and result['mcc'] > best_dev_acc: + best_dev_acc = result['mcc'] + save_model = True + + if save_model: + logger.info("***** Save model *****") + model_to_save = model.module if hasattr(model, 'module') else model + + model_name = WEIGHTS_NAME + + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + model.train() + + +if __name__ == "__main__": + main() diff --git a/TernaryBERT/quant_task_glue.py b/TernaryBERT/quant_task_glue.py index 1356da24..a4740e62 100644 --- a/TernaryBERT/quant_task_glue.py +++ b/TernaryBERT/quant_task_glue.py @@ -10,11 +10,11 @@ import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.tensorboard import SummaryWriter from torch.nn import CrossEntropyLoss, MSELoss -from transformer import BertForSequenceClassification,WEIGHTS_NAME, CONFIG_NAME +from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification from transformer import BertTokenizer from transformer import BertAdam @@ -26,6 +26,7 @@ format=log_format, datefmt='%m/%d %I:%M:%S %p') logger = logging.getLogger() + def get_tensor_data(output_mode, features): if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) @@ -36,16 +37,17 @@ def get_tensor_data(output_mode, features): all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], 
dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_label_ids, all_seq_lengths) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths) return tensor_data, all_label_ids + def do_eval(model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels): eval_loss = 0 nb_eval_steps = 0 preds = [] - for _,batch_ in enumerate(eval_dataloader): + for _, batch_ in enumerate(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ @@ -78,11 +80,13 @@ def do_eval(model, task_name, eval_dataloader, result['eval_loss'] = eval_loss return result + def soft_cross_entropy(predicts, targets): student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) targets_prob = torch.nn.functional.softmax(targets, dim=-1) return (- targets_prob * student_likelihood).mean() + def main(): parser = argparse.ArgumentParser() parser.add_argument("--data_dir", @@ -109,7 +113,7 @@ def main(): default='output', type=str, help="The output directory where the model predictions and checkpoints will be written.") - + parser.add_argument("--learning_rate", default=2e-5, type=float, @@ -122,7 +126,7 @@ def main(): type=int, default=42, help="random seed for initialization") - + parser.add_argument('--aug_train', action='store_false', help="Whether to use augmented data or not") @@ -142,7 +146,7 @@ def main(): parser.add_argument("--weight_bits", default=2, type=int, - choices=[2,8], + choices=[2, 8], help="Quantization bits for weight.") parser.add_argument("--input_bits", default=8, @@ -158,17 +162,17 @@ def main(): summaryWriter = SummaryWriter(args.output_dir) logger.info('The args: {}'.format(args)) task_name = args.task_name.lower() - data_dir = os.path.join(args.data_dir,task_name) - output_dir = 
os.path.join(args.output_dir,task_name) + data_dir = os.path.join(args.data_dir, task_name) + output_dir = os.path.join(args.output_dir, task_name) # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name) if not os.path.exists(output_dir): os.mkdir(output_dir) - + if args.student_model is None: - args.student_model = os.path.join(args.model_dir,task_name) + args.student_model = os.path.join(args.model_dir, task_name) if args.teacher_model is None: - args.teacher_model = os.path.join(args.model_dir,task_name) + args.teacher_model = os.path.join(args.model_dir, task_name) processors = { "cola": ColaProcessor, @@ -194,14 +198,14 @@ def main(): } default_params = { - "cola": {"max_seq_length": 64,"batch_size":16,"eval_step":50}, - "mnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "mrpc": {"max_seq_length": 128,"batch_size":32,"eval_step":200}, - "sst-2": {"max_seq_length": 64,"batch_size":32,"eval_step":200}, - "sts-b": {"max_seq_length": 128,"batch_size":32,"eval_step":50}, - "qqp": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "qnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000}, - "rte": {"max_seq_length": 128,"batch_size":32,"eval_step":100} + "cola": {"max_seq_length": 64, "batch_size": 16, "eval_step": 50}, + "mnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "mrpc": {"max_seq_length": 128, "batch_size": 32, "eval_step": 200}, + "sst-2": {"max_seq_length": 64, "batch_size": 32, "eval_step": 200}, + "sts-b": {"max_seq_length": 128, "batch_size": 32, "eval_step": 50}, + "qqp": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "qnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000}, + "rte": {"max_seq_length": 128, "batch_size": 32, "eval_step": 100} } acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] @@ -218,11 +222,11 @@ def main(): torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) - + if task_name in 
default_params: args.batch_size = default_params[task_name]["batch_size"] if n_gpu > 0: - args.batch_size = int(args.batch_size*n_gpu) + args.batch_size = int(args.batch_size * n_gpu) args.max_seq_length = default_params[task_name]["max_seq_length"] args.eval_step = default_params[task_name]["eval_step"] @@ -232,35 +236,36 @@ def main(): num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True) - + if args.aug_train: try: - train_file = os.path.join(processed_data_dir,'aug_data') - train_features = pickle.load(open(train_file,'rb')) + train_file = os.path.join(processed_data_dir, 'aug_data') + train_features = pickle.load(open(train_file, 'rb')) except: train_examples = processor.get_aug_examples(data_dir) train_features = convert_examples_to_features(train_examples, label_list, - args.max_seq_length, tokenizer, output_mode) + args.max_seq_length, tokenizer, output_mode) else: try: - train_file = os.path.join(processed_data_dir,'train_data') - train_features = pickle.load(open(train_file,'rb')) + train_file = os.path.join(processed_data_dir, 'train_data') + train_features = pickle.load(open(train_file, 'rb')) except: train_examples = processor.get_train_examples(data_dir) train_features = convert_examples_to_features(train_examples, label_list, - args.max_seq_length, tokenizer, output_mode) + args.max_seq_length, tokenizer, output_mode) num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs train_data, _ = get_tensor_data(output_mode, train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) - + try: - dev_file = train_file = os.path.join(processed_data_dir,'dev_data') - eval_features = pickle.load(open(dev_file,'rb')) + dev_file = train_file = os.path.join(processed_data_dir, 'dev_data') + eval_features = pickle.load(open(dev_file, 'rb')) except: eval_examples = 
processor.get_dev_examples(data_dir) - eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, + output_mode) eval_data, eval_labels = get_tensor_data(output_mode, eval_features) eval_sampler = SequentialSampler(eval_data) @@ -268,13 +273,13 @@ def main(): if task_name == "mnli": processor = processors["mnli-mm"]() try: - dev_mm_file = train_file = os.path.join(processed_data_dir,'dev-mm_data') - mm_eval_features = pickle.load(open(dev_mm_file,'rb')) + dev_mm_file = train_file = os.path.join(processed_data_dir, 'dev-mm_data') + mm_eval_features = pickle.load(open(dev_mm_file, 'rb')) except: mm_eval_examples = processor.get_dev_examples(data_dir) mm_eval_features = convert_examples_to_features( mm_eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - + mm_eval_data, mm_eval_labels = get_tensor_data(output_mode, mm_eval_features) logger.info(" Num examples = %d", len(mm_eval_features)) @@ -289,11 +294,11 @@ def main(): teacher_model = torch.nn.DataParallel(teacher_model) result = do_eval(teacher_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + device, output_mode, eval_labels, num_labels) if task_name in acc_tasks: - if task_name in ['sst-2','mnli','qnli','rte']: + if task_name in ['sst-2', 'mnli', 'qnli', 'rte']: fp32_performance = f"acc:{result['acc']}" - elif task_name in ['mrpc','qqp']: + elif task_name in ['mrpc', 'qqp']: fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}" if task_name in corr_tasks: fp32_performance = f"pearson/spearmanr:{result['pearson']}/{result['spearmanr']}" @@ -303,15 +308,16 @@ def main(): if task_name == "mnli": result = do_eval(teacher_model, 'mnli-mm', mm_eval_dataloader, - device, output_mode, mm_eval_labels, num_labels) + device, output_mode, mm_eval_labels, num_labels) fp32_performance += f" 
mm-acc:{result['acc']}" - fp32_performance = task_name +' fp32 ' + fp32_performance - student_config = BertConfig.from_pretrained(args.teacher_model, + fp32_performance = task_name + ' fp32 ' + fp32_performance + student_config = BertConfig.from_pretrained(args.teacher_model, quantize_act=True, - weight_bits = args.weight_bits, - input_bits = args.input_bits, - clip_val = args.clip_val) - student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config = student_config, num_labels=num_labels) + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val) + student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config=student_config, + num_labels=num_labels) student_model.to(device) logger.info("***** Running training *****") @@ -320,7 +326,7 @@ def main(): logger.info(" Num steps = %d", num_train_optimization_steps) if n_gpu > 1: student_model = torch.nn.DataParallel(student_model) - + # Prepare optimizer param_optimizer = list(student_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] @@ -330,15 +336,15 @@ def main(): ] schedule = 'warmup_linear' optimizer = BertAdam(optimizer_grouped_parameters, - schedule=schedule, - lr=args.learning_rate, - warmup=0.1, - t_total=num_train_optimization_steps) + schedule=schedule, + lr=args.learning_rate, + warmup=0.1, + t_total=num_train_optimization_steps) loss_mse = MSELoss() global_step = 0 best_dev_acc = 0.0 previous_best = None - + tr_loss = 0. tr_att_loss = 0. tr_rep_loss = 0. 
@@ -359,10 +365,10 @@ def main(): with torch.no_grad(): teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask) - + if args.pred_distill: if output_mode == "classification": - cls_loss = soft_cross_entropy(student_logits,teacher_logits) + cls_loss = soft_cross_entropy(student_logits, teacher_logits) elif output_mode == "regression": cls_loss = loss_mse(student_logits, teacher_logits) @@ -372,9 +378,9 @@ def main(): if args.intermediate_distill: for student_att, teacher_att in zip(student_atts, teacher_atts): student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device), - student_att) + student_att) teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device), - teacher_att) + teacher_att) tmp_loss = loss_mse(student_att, teacher_att) att_loss += tmp_loss @@ -397,7 +403,7 @@ def main(): tr_loss += loss.item() nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 - if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps-1: + if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1: logger.info("***** Running evaluation *****") logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) if previous_best is not None: @@ -411,34 +417,34 @@ def main(): rep_loss = tr_rep_loss / (step + 1) result = do_eval(student_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + device, output_mode, eval_labels, num_labels) result['global_step'] = global_step result['cls_loss'] = cls_loss result['att_loss'] = att_loss result['rep_loss'] = rep_loss result['loss'] = loss - summaryWriter.add_scalar('total_loss',loss,global_step) - summaryWriter.add_scalars('distill_loss',{'att_loss':att_loss, - 'rep_loss':rep_loss, - 'cls_loss':cls_loss},global_step) - - if task_name=='cola': - summaryWriter.add_scalar('mcc',result['mcc'],global_step) - elif task_name in 
['sst-2','mnli','mnli-mm','qnli','rte','wnli']: - summaryWriter.add_scalar('acc',result['acc'],global_step) - elif task_name in ['mrpc','qqp']: - summaryWriter.add_scalars('performance',{'acc':result['acc'], - 'f1':result['f1'], - 'acc_and_f1':result['acc_and_f1']},global_step) + summaryWriter.add_scalar('total_loss', loss, global_step) + summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss, + 'rep_loss': rep_loss, + 'cls_loss': cls_loss}, global_step) + + if task_name == 'cola': + summaryWriter.add_scalar('mcc', result['mcc'], global_step) + elif task_name in ['sst-2', 'mnli', 'mnli-mm', 'qnli', 'rte', 'wnli']: + summaryWriter.add_scalar('acc', result['acc'], global_step) + elif task_name in ['mrpc', 'qqp']: + summaryWriter.add_scalars('performance', {'acc': result['acc'], + 'f1': result['f1'], + 'acc_and_f1': result['acc_and_f1']}, global_step) else: - summaryWriter.add_scalar('corr',result['corr'],global_step) + summaryWriter.add_scalar('corr', result['corr'], global_step) save_model = False if task_name in acc_tasks and result['acc'] > best_dev_acc: - if task_name in ['sst-2','mnli','qnli','rte']: + if task_name in ['sst-2', 'mnli', 'qnli', 'rte']: previous_best = f"acc:{result['acc']}" - elif task_name in ['mrpc','qqp']: + elif task_name in ['mrpc', 'qqp']: previous_best = f"f1/acc:{result['f1']}/{result['acc']}" best_dev_acc = result['acc'] save_model = True @@ -457,8 +463,8 @@ def main(): # Test mnli-mm if task_name == "mnli": result = do_eval(student_model, 'mnli-mm', mm_eval_dataloader, - device, output_mode, mm_eval_labels, num_labels) - previous_best+= f"mm-acc:{result['acc']}" + device, output_mode, mm_eval_labels, num_labels) + previous_best += f"mm-acc:{result['acc']}" logger.info(fp32_performance) logger.info(previous_best) if args.save_fp_model: @@ -478,10 +484,11 @@ def main(): model_to_save = student_model.module if hasattr(student_model, 'module') else student_model quant_model = copy.deepcopy(model_to_save) for name, module in 
quant_model.named_modules(): - if hasattr(module,'weight_quantizer'): - module.weight.data = module.weight_quantizer.apply(module.weight,module.weight_clip_val, - module.weight_bits,True) - + if hasattr(module, 'weight_quantizer'): + module.weight.data = module.weight_quantizer.apply(module.weight, + module.weight_clip_val, + module.weight_bits, True) + output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) diff --git a/TernaryBERT/quant_task_polemo.py b/TernaryBERT/quant_task_polemo.py new file mode 100644 index 00000000..ca07bfe3 --- /dev/null +++ b/TernaryBERT/quant_task_polemo.py @@ -0,0 +1,422 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import os +import random +import sys +import pickle +import copy + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.tensorboard import SummaryWriter +from torch.nn import CrossEntropyLoss, MSELoss + +from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME +from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification +from transformer import BertTokenizer +from transformer import BertAdam +from transformer import BertConfig +from utils_multiemo import * + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +logger = logging.getLogger() + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = 
torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + preds = [] + + for _, batch_ in enumerate(eval_dataloader): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append( + preds[0], logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(task_name, preds, eval_labels.numpy()) + result['eval_loss'] = eval_loss + return result + + +def soft_cross_entropy(predicts, targets): + student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) + targets_prob = torch.nn.functional.softmax(targets, dim=-1) + return (- targets_prob * student_likelihood).mean() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default='data', + type=str, 
+ help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_dir", + default='models/tinybert', + type=str, + help="The model dir.") + parser.add_argument("--teacher_model", + default=None, + type=str, + help="The models directory.") + parser.add_argument("--student_model", + default=None, + type=str, + help="The models directory.") + parser.add_argument("--task_name", + default='sst-2', + type=str, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default='output', + type=str, + help="The output directory where the model predictions and checkpoints will be written.") + + parser.add_argument("--learning_rate", + default=2e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + + parser.add_argument('--aug_train', + action='store_false', + help="Whether to use augmented data or not") + parser.add_argument('--pred_distill', + action='store_true', + help="Whether to distil with task layer") + parser.add_argument('--intermediate_distill', + action='store_true', + help="Whether to distil with intermediate layers") + parser.add_argument('--save_fp_model', + action='store_true', + help="Whether to save fp32 model") + parser.add_argument('--save_quantized_model', + action='store_true', + help="Whether to save quantized model") + + parser.add_argument("--weight_bits", + default=2, + type=int, + choices=[2, 8], + help="Quantization bits for weight.") + parser.add_argument("--input_bits", + default=8, + type=int, + help="Quantization bits for activation.") + parser.add_argument("--clip_val", + default=2.5, + type=float, + help="Initial clip value.") + + args = parser.parse_args() + assert args.pred_distill or args.intermediate_distill, 
"'pred_distill' and 'intermediate_distill', at least one must be True" + summaryWriter = SummaryWriter(args.output_dir) + logger.info('The args: {}'.format(args)) + task_name = args.task_name.lower() + data_dir = os.path.join(args.data_dir, task_name) + output_dir = os.path.join(args.output_dir, task_name) + # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name) + + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + if args.student_model is None: + args.student_model = os.path.join(args.model_dir, task_name) + if args.teacher_model is None: + args.teacher_model = os.path.join(args.model_dir, task_name) + + processors = { + "multiemo": MultiemoProcessor + } + + output_modes = { + "multiemo": "classification" + } + + default_params = { + "multiemo": {"max_seq_length": 128, "batch_size": 16, "eval_step": 50} + } + + acc_tasks = ["multiemo"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if task_name in default_params: + args.batch_size = default_params[task_name]["batch_size"] + if n_gpu > 0: + args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params[task_name]["max_seq_length"] + args.eval_step = default_params[task_name]["eval_step"] + + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = output_modes['multiemo'] + else: + raise ValueError("Task not found: %s" % task_name) + + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True) + + if args.aug_train: + train_examples = 
processor.get_aug_examples(data_dir) + train_features = convert_examples_to_features(train_examples, label_list, + args.max_seq_length, tokenizer, output_mode) + else: + train_examples = processor.get_train_examples(data_dir) + train_features = convert_examples_to_features(train_examples, label_list, + args.max_seq_length, tokenizer, output_mode) + + num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) + + eval_examples = processor.get_dev_examples(data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, + output_mode) + + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + + teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model) + teacher_model.to(device) + teacher_model.eval() + if n_gpu > 1: + teacher_model = torch.nn.DataParallel(teacher_model) + + result = do_eval(teacher_model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}" + fp32_performance = task_name + ' fp32 ' + fp32_performance + + student_config = BertConfig.from_pretrained( + args.teacher_model, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config=student_config, + num_labels=num_labels) + student_model.to(device) + + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.batch_size) + 
logger.info(" Num steps = %d", num_train_optimization_steps) + if n_gpu > 1: + student_model = torch.nn.DataParallel(student_model) + + # Prepare optimizer + param_optimizer = list(student_model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + optimizer = BertAdam(optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=0.1, + t_total=num_train_optimization_steps) + loss_mse = MSELoss() + global_step = 0 + best_dev_acc = 0.0 + previous_best = None + + tr_loss = 0. + tr_att_loss = 0. + tr_rep_loss = 0. + tr_cls_loss = 0. + for epoch_ in range(int(args.num_train_epochs)): + nb_tr_examples, nb_tr_steps = 0, 0 + + for step, batch in enumerate(train_dataloader): + student_model.train() + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch + att_loss = 0. + rep_loss = 0. + cls_loss = 0. + loss = 0. 
+ + student_logits, student_atts, student_reps = student_model(input_ids, segment_ids, input_mask) + + with torch.no_grad(): + teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask) + + if args.pred_distill: + if output_mode == "classification": + cls_loss = soft_cross_entropy(student_logits, teacher_logits) + elif output_mode == "regression": + cls_loss = loss_mse(student_logits, teacher_logits) + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if args.intermediate_distill: + for student_att, teacher_att in zip(student_atts, teacher_atts): + student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device), + student_att) + teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device), + teacher_att) + tmp_loss = loss_mse(student_att, teacher_att) + att_loss += tmp_loss + + for student_rep, teacher_rep in zip(student_reps, teacher_reps): + tmp_loss = loss_mse(student_rep, teacher_rep) + rep_loss += tmp_loss + + loss += rep_loss + att_loss + tr_att_loss += att_loss.item() + tr_rep_loss += rep_loss.item() + + if n_gpu > 1: + loss = loss.mean() + + loss.backward() + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + tr_loss += loss.item() + nb_tr_examples += label_ids.size(0) + nb_tr_steps += 1 + if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1: + logger.info("***** Running evaluation *****") + logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) + if previous_best is not None: + logger.info(f"{fp32_performance}\nPrevious best = {previous_best}") + + student_model.eval() + + loss = tr_loss / (step + 1) + cls_loss = tr_cls_loss / (step + 1) + att_loss = tr_att_loss / (step + 1) + rep_loss = tr_rep_loss / (step + 1) + + result = do_eval(student_model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['global_step'] = global_step + result['cls_loss'] = 
cls_loss + result['att_loss'] = att_loss + result['rep_loss'] = rep_loss + result['loss'] = loss + summaryWriter.add_scalar('total_loss', loss, global_step) + summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss, + 'rep_loss': rep_loss, + 'cls_loss': cls_loss}, global_step) + + summaryWriter.add_scalars('performance', {'acc': result['acc'], + 'f1': result['f1'], + 'acc_and_f1': result['acc_and_f1']}, global_step) + + save_model = False + + if task_name in acc_tasks and result['acc'] > best_dev_acc: + previous_best = f"f1/acc:{result['f1']}/{result['acc']}" + best_dev_acc = result['acc'] + save_model = True + + if save_model: + logger.info(fp32_performance) + logger.info(previous_best) + if args.save_fp_model: + logger.info("******************** Save full precision model ********************") + model_to_save = student_model.module if hasattr(student_model, 'module') else student_model + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_dir) + if args.save_quantized_model: + logger.info("******************** Save quantized model ********************") + output_quant_dir = os.path.join(output_dir, 'quant') + if not os.path.exists(output_quant_dir): + os.makedirs(output_quant_dir) + model_to_save = student_model.module if hasattr(student_model, 'module') else student_model + quant_model = copy.deepcopy(model_to_save) + for name, module in quant_model.named_modules(): + if hasattr(module, 'weight_quantizer'): + module.weight.data = module.weight_quantizer.apply(module.weight, + module.weight_clip_val, + module.weight_bits, True) + + output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) + + torch.save(quant_model.state_dict(), output_model_file) + 
"""Download the MultiEmo corpus bundle and unpack it into a data directory."""
import argparse
import os
import zipfile

import requests
from tqdm.auto import tqdm

# url = 'https://clarin-pl.eu/dspace/bitstream/handle/11321/798/multiemo.zip?sequence=2&isAllowed=y'
url = 'https://clarin-pl.eu/dspace/handle/11321/798/allzip'


def main(data_dir):
    """Fetch the archive at ``url`` and extract the dataset into ``data_dir``.

    The downloaded bundle is extracted, then the ``multiemo.zip`` it contains
    is extracted as well. The bundle, ``multiemo.7z``, ``multiemo.zip`` and
    ``README.txt`` are deleted afterwards.

    Args:
        data_dir: existing directory that receives the extracted files.

    Raises:
        RuntimeError: if the server does not answer with HTTP 200.
    """
    output_zip = os.path.join(
        data_dir,
        'MultiEmo_ Multilingual, Multilevel, Multidomain Sentiment Analysis Corpus of Consumer Reviews.zip')

    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            # Fail here with the real cause instead of later, when ZipFile
            # cannot find an archive that was never written.
            raise RuntimeError(
                "Download failed: HTTP %d for %s" % (response.status_code, url))

        total_size_in_bytes = int(response.headers.get('content-length', 0))
        block_size = 1024
        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar, \
                open(output_zip, 'wb') as f:
            for chunk in response.iter_content(chunk_size=block_size):
                if chunk:
                    progress_bar.update(len(chunk))
                    f.write(chunk)
            downloaded_bytes = progress_bar.n

    # Kept as a warning only: a size mismatch is reported but extraction is
    # still attempted, as before.
    if total_size_in_bytes != 0 and downloaded_bytes != total_size_in_bytes:
        print("ERROR, something went wrong")

    with zipfile.ZipFile(output_zip, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    os.remove(output_zip)
    os.remove(os.path.join(data_dir, 'multiemo.7z'))

    data_output_zip = os.path.join(data_dir, 'multiemo.zip')
    with zipfile.ZipFile(data_output_zip, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    os.remove(data_output_zip)
    os.remove(os.path.join(data_dir, 'README.txt'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    args = parser.parse_args()

    # Also creates missing parent directories and tolerates an existing one.
    os.makedirs(args.data_dir, exist_ok=True)

    main(data_dir=args.data_dir)
get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -276,7 +278,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -298,7 +300,7 @@ def _create_examples(self, lines, set_type): guid = "%s-%s" % (set_type, line[0]) text_a = line[7] text_b = line[8] - if set_type== 'test': + if set_type == 'test': label = None else: label = line[-1] @@ -319,7 +321,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -340,7 +342,7 @@ def _create_examples(self, lines, set_type): continue guid = "%s-%s" % (set_type, line[0]) try: - if set_type=='test': + if set_type == 'test': text_a = line[1] text_b = line[2] label = None @@ -368,7 +370,7 @@ def get_dev_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -388,7 +390,7 @@ def _create_examples(self, lines, set_type): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) - if set_type=='test': + if set_type == 'test': text_a = line[1] text_b = line[2] label = None @@ -413,7 +415,7 @@ def get_dev_examples(self, data_dir): """See base class.""" return self._create_examples( self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - + def get_test_examples(self, data_dir): return self._create_examples( self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") @@ -433,7 +435,7 @@ def 
_create_examples(self, lines, set_type): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) - if set_type=='test': + if set_type == 'test': text_a = line[1] text_b = line[2] label = None diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py new file mode 100644 index 00000000..dcec6cb8 --- /dev/null +++ b/TernaryBERT/utils_multiemo.py @@ -0,0 +1,226 @@ +import os +import logging +import sys +import csv + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score + +logger = logging.getLogger() + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = label_id + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for the test set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + if sys.version_info[0] == 2: + line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MultiemoProcessor(DataProcessor): + """Processor for the Multiemo data2 set""" + + def __init__(self, lang: str, domain: str, kind: str): + super(MultiemoProcessor, self).__init__() + self.lang = lang.lower() + self.domain = domain.lower() + self.kind = kind.lower() + + def get_train_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'train') + logger.info(f"LOOKING AT {file_path}") + return self._create_examples(self._read_txt(file_path), "train") + + def get_dev_examples(self, data_dir: str) -> 
List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'dev') + return self._create_examples(self._read_txt(file_path), "dev") + + def get_test_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'test') + return self._create_examples(self._read_txt(file_path), "test") + + def get_set_type_path(self, data_dir: str, set_type: str) -> str: + return os.path.join(data_dir, self.domain + '.' + self.kind + '.' + set_type + '.' + self.lang + '.txt') + + def get_labels(self) -> List[str]: + """See base class.""" + if self.kind == 'text': + return ["meta_amb", "meta_minus_m", "meta_plus_m", "meta_zero"] + else: + return ["z_amb", "z_minus_m", "z_plus_m", "z_zero"] + + @staticmethod + def _create_examples(lines: List[str], set_type: str) -> List[InputExample]: + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + split_line = line.split('__label__') + text_a = split_line[0] + label = split_line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) 
+ 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + seq_length = len(input_ids) + + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + try: + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + except: + label_id = 0 + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: {}".format(example.label)) + logger.info("label_id: {}".format(label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + seq_length=seq_length)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "multiemo": + return acc_and_f1(preds, labels) + else: + 
raise KeyError(task_name) From 26328e0f832d64b724fe2315efdecd746e9865e6 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 19:08:50 +0100 Subject: [PATCH 22/62] feat: add evaluation on test set after training --- ...une_bert.py => multiemo_fine_tune_bert.py} | 106 +++++++++++------ ..._task_polemo.py => quant_task_multiemo.py} | 107 +++++++++++++----- TernaryBERT/scripts/download_dataset.py | 1 + TernaryBERT/utils.py | 39 +++++++ TernaryBERT/utils_multiemo.py | 4 +- 5 files changed, 191 insertions(+), 66 deletions(-) rename TernaryBERT/{fine_tune_bert.py => multiemo_fine_tune_bert.py} (83%) rename TernaryBERT/{quant_task_polemo.py => quant_task_multiemo.py} (83%) create mode 100644 TernaryBERT/utils.py diff --git a/TernaryBERT/fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py similarity index 83% rename from TernaryBERT/fine_tune_bert.py rename to TernaryBERT/multiemo_fine_tune_bert.py index e2c1d1ee..451c7091 100644 --- a/TernaryBERT/fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -29,11 +29,13 @@ import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler) +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from tqdm import tqdm, trange from torch.nn import CrossEntropyLoss, MSELoss +from sklearn.metrics import classification_report +from utils import result_to_text_file from utils_multiemo import * from transformer.modeling import BertForSequenceClassification from transformer.tokenization import BertTokenizer @@ -51,15 +53,6 @@ logger = logging.getLogger() -def result_to_file(result, file_name): - with open(file_name, "a") as writer: - writer.write("") - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info("%s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - def get_tensor_data(output_mode, features): if output_mode == "classification": all_label_ids = 
torch.tensor([f.label_id for f in features], dtype=torch.long) @@ -78,13 +71,12 @@ def do_eval(model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels): eval_loss = 0 nb_eval_steps = 0 - preds = [] + all_logits = None - for batch_ in tqdm(eval_dataloader, desc="Evaluating"): + for _, batch_ in enumerate(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ - logits, _, _ = model(input_ids, segment_ids, input_mask) # create eval loss and other metric required by the task @@ -97,23 +89,19 @@ def do_eval(model, task_name, eval_dataloader, eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) + + if all_logits is None: + all_logits = logits.detach().cpu().numpy() else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) + all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - if output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, eval_labels.numpy()) + if output_mode == "regression": + all_logits = np.squeeze(all_logits) + result = compute_metrics(task_name, all_logits, eval_labels.numpy()) result['eval_loss'] = eval_loss - - return result + return result, all_logits def main(): @@ -246,11 +234,17 @@ def main(): if task_name in default_params: args.num_train_epoch = default_params[task_name]["num_train_epochs"] - if task_name not in processors: + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = 'classification' + else: raise ValueError("Task not found: %s" % task_name) - 
processor = processors[task_name]() - output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) @@ -284,15 +278,14 @@ def main(): model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) model.to(device) - if args.do_eval: logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() - result = do_eval(model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) @@ -385,13 +378,13 @@ def main(): loss = tr_loss / (step + 1) cls_loss = tr_cls_loss / (step + 1) - result = do_eval(model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) result['global_step'] = global_step result['cls_loss'] = cls_loss result['loss'] = loss - result_to_file(result, output_eval_file) + result_to_text_file(result, output_eval_file) save_model = False @@ -422,6 +415,51 @@ def main(): model.train() + # Measure End Time + training_end_time = time.monotonic() + + diff = timedelta(seconds=training_end_time - training_start_time) + diff_seconds = diff.total_seconds() + + training_parameters = vars(args) + training_parameters['training_time'] = diff_seconds + + output_training_params_file = os.path.join(args.output_dir, "training_params.json") + dictionary_to_json(training_parameters, output_training_params_file) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(args.data_dir) + test_features = convert_examples_to_features(test_examples, label_list, 
args.max_seq_length, tokenizer, + output_mode) + + test_data, test_labels = get_tensor_data(output_mode, test_features) + test_sampler = SequentialSampler(eval_data) + test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", len(test_features)) + logger.info(" Batch size = %d", args.batch_size) + + eval_start_time = time.monotonic() + result, y_logits = do_eval(model, task_name, test_dataloader, + device, output_mode, test_labels, num_labels) + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + result['eval_time'] = diff_seconds + result_to_text_file(result, os.path.join(args.output_dir, "test_results.txt")) + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(y_true, y_pred, target_names=label_list)) + + report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json")) + if __name__ == "__main__": main() diff --git a/TernaryBERT/quant_task_polemo.py b/TernaryBERT/quant_task_multiemo.py similarity index 83% rename from TernaryBERT/quant_task_polemo.py rename to TernaryBERT/quant_task_multiemo.py index ca07bfe3..0561cda1 100644 --- a/TernaryBERT/quant_task_polemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -7,12 +7,15 @@ import sys import pickle import copy +import time +from datetime import timedelta import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset -from torch.utils.tensorboard import SummaryWriter from torch.nn import CrossEntropyLoss, MSELoss +from sklearn.metrics import classification_report +from tqdm import trange from transformer import 
BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification @@ -20,6 +23,7 @@ from transformer import BertAdam from transformer import BertConfig from utils_multiemo import * +from utils import dictionary_to_json, result_to_text_file log_format = '%(asctime)s %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, @@ -45,7 +49,7 @@ def do_eval(model, task_name, eval_dataloader, device, output_mode, eval_labels, num_labels): eval_loss = 0 nb_eval_steps = 0 - preds = [] + all_logits = None for _, batch_ in enumerate(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) @@ -63,22 +67,19 @@ def do_eval(model, task_name, eval_dataloader, eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) + + if all_logits is None: + all_logits = logits.detach().cpu().numpy() else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) + all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - if output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, eval_labels.numpy()) + if output_mode == "regression": + all_logits = np.squeeze(all_logits) + result = compute_metrics(task_name, all_logits, eval_labels.numpy()) result['eval_loss'] = eval_loss - return result + return result, all_logits def soft_cross_entropy(predicts, targets): @@ -106,7 +107,6 @@ def main(): type=str, help="The models directory.") parser.add_argument("--task_name", - default='sst-2', type=str, help="The name of the task to train.") parser.add_argument("--output_dir", @@ -159,10 +159,9 @@ def main(): args = parser.parse_args() assert args.pred_distill or args.intermediate_distill, "'pred_distill' and 
'intermediate_distill', at least one must be True" - summaryWriter = SummaryWriter(args.output_dir) logger.info('The args: {}'.format(args)) task_name = args.task_name.lower() - data_dir = os.path.join(args.data_dir, task_name) + data_dir = os.path.join(args.data_dir) output_dir = os.path.join(args.output_dir, task_name) # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name) @@ -250,8 +249,9 @@ def main(): if n_gpu > 1: teacher_model = torch.nn.DataParallel(teacher_model) - result = do_eval(teacher_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + result, _ = do_eval(teacher_model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}" fp32_performance = task_name + ' fp32 ' + fp32_performance @@ -266,6 +266,8 @@ def main(): num_labels=num_labels) student_model.to(device) + training_start_time = time.monotonic() + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.batch_size) @@ -290,12 +292,13 @@ def main(): global_step = 0 best_dev_acc = 0.0 previous_best = None + output_eval_file = os.path.join(output_dir, "eval_results.txt") tr_loss = 0. tr_att_loss = 0. tr_rep_loss = 0. tr_cls_loss = 0. 
- for epoch_ in range(int(args.num_train_epochs)): + for epoch_ in trange(int(args.num_train_epochs)): nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): @@ -362,21 +365,16 @@ def main(): att_loss = tr_att_loss / (step + 1) rep_loss = tr_rep_loss / (step + 1) - result = do_eval(student_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) + result, _ = do_eval(student_model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['global_step'] = global_step result['cls_loss'] = cls_loss result['att_loss'] = att_loss result['rep_loss'] = rep_loss result['loss'] = loss - summaryWriter.add_scalar('total_loss', loss, global_step) - summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss, - 'rep_loss': rep_loss, - 'cls_loss': cls_loss}, global_step) - summaryWriter.add_scalars('performance', {'acc': result['acc'], - 'f1': result['f1'], - 'acc_and_f1': result['acc_and_f1']}, global_step) + result_to_text_file(result, output_eval_file) save_model = False @@ -406,9 +404,11 @@ def main(): quant_model = copy.deepcopy(model_to_save) for name, module in quant_model.named_modules(): if hasattr(module, 'weight_quantizer'): - module.weight.data = module.weight_quantizer.apply(module.weight, - module.weight_clip_val, - module.weight_bits, True) + module.weight.data = module.weight_quantizer.apply( + module.weight, + module.weight_clip_val, + module.weight_bits, True + ) output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) @@ -417,6 +417,51 @@ def main(): model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_quant_dir) + # Measure End Time + training_end_time = time.monotonic() + + diff = timedelta(seconds=training_end_time - training_start_time) + diff_seconds = diff.total_seconds() + + training_parameters = vars(args) + training_parameters['training_time'] = diff_seconds + + 
output_training_params_file = os.path.join(output_dir, "training_params.json") + dictionary_to_json(training_parameters, output_training_params_file) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(data_dir) + test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, + output_mode) + + test_data, test_labels = get_tensor_data(output_mode, test_features) + test_sampler = SequentialSampler(eval_data) + test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", len(test_features)) + logger.info(" Batch size = %d", args.batch_size) + + eval_start_time = time.monotonic() + result, y_logits = do_eval(student_model, task_name, test_dataloader, + device, output_mode, test_labels, num_labels) + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + result['eval_time'] = diff_seconds + result_to_text_file(result, os.path.join(output_dir, "test_results.txt")) + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) + + report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + dictionary_to_json(report, os.path.join(output_dir, "test_results.json")) + if __name__ == "__main__": main() diff --git a/TernaryBERT/scripts/download_dataset.py b/TernaryBERT/scripts/download_dataset.py index de38d570..08bdc6fb 100644 --- a/TernaryBERT/scripts/download_dataset.py +++ b/TernaryBERT/scripts/download_dataset.py @@ -1,3 +1,4 @@ +import argparse import os import zipfile diff --git a/TernaryBERT/utils.py b/TernaryBERT/utils.py new file mode 100644 index 
00000000..307ea9b2 --- /dev/null +++ b/TernaryBERT/utils.py @@ -0,0 +1,39 @@ +import json +import logging +import os +import sys + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger(__name__) + + +def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None: + with open(file_name, "a") as writer: + if verbose: + logger.info("***** Eval results *****") + + for key in sorted(result.keys()): + if verbose: + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + writer.write("") + + +def dictionary_to_json(dictionary: dict, file_name: str): + with open(file_name, "w") as f: + json.dump(dictionary, f, indent=2) + + +def is_folder_empty(folder_name: str): + if len([f for f in os.listdir(folder_name) if not f.startswith('.')]) == 0: + return True + else: + return False + + +def get_immediate_subdirectories(directory: str): + return [os.path.join(directory, name) for name in os.listdir(directory) + if os.path.isdir(os.path.join(directory, name))] \ No newline at end of file diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py index dcec6cb8..b20e8f61 100644 --- a/TernaryBERT/utils_multiemo.py +++ b/TernaryBERT/utils_multiemo.py @@ -3,6 +3,7 @@ import sys import csv +import numpy as np from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score @@ -218,7 +219,8 @@ def acc_and_f1(preds, labels): } -def compute_metrics(task_name, preds, labels): +def compute_metrics(task_name, logits, labels): + preds = np.argmax(logits, axis=1) assert len(preds) == len(labels) if task_name == "multiemo": return acc_and_f1(preds, labels) From e549623f988ceaf9c2cf59b514ce44bd369b1abb Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 19:10:15 +0100 Subject: [PATCH 23/62] feat: add Dockerfile --- TernaryBERT/Dockerfile | 
25 +++++++++++++++++++++++++ TernaryBERT/requirements.txt | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 TernaryBERT/Dockerfile diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile new file mode 100644 index 00000000..81616e69 --- /dev/null +++ b/TernaryBERT/Dockerfile @@ -0,0 +1,25 @@ +FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04 + +ENV TZ=Europe/Minsk +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone +RUN apt update && \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + apt install --no-install-recommends -y python3.8.12 python3-pip python3-dev python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +RUN python3.8 -m pip install --upgrade pip && \ + python3.8 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \ + -f https://download.pytorch.org/whl/torch_stable.html + +COPY ./requirements.txt . 
+RUN python3.8 -m pip install --no-cache-dir -r requirements.txt +RUN rm requirements.txt + +ARG USER_ID +ARG GROUP_ID + +RUN addgroup --gid $GROUP_ID user +RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user +USER user diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt index 7bbcdf0a..f86b0709 100644 --- a/TernaryBERT/requirements.txt +++ b/TernaryBERT/requirements.txt @@ -4,4 +4,4 @@ scipy future Pillow tensorflow==1.14.0 -torch==1.1.0 +# torch==1.1.0 From d94594abc746a1b1ecc5b5610075f5fd05372cc5 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 19:47:57 +0100 Subject: [PATCH 24/62] feat: add experiments script --- TernaryBERT/multiemo_fine_tune_bert.py | 15 +++-- TernaryBERT/quant_task_multiemo.py | 30 ++++++--- TernaryBERT/run_experiments.py | 85 +++++++++++++++++++++++++ TernaryBERT/scripts/download_dataset.py | 2 +- TernaryBERT/utils_multiemo.py | 1 + 5 files changed, 118 insertions(+), 15 deletions(-) create mode 100644 TernaryBERT/run_experiments.py diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index 451c7091..a41b4d69 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -26,6 +26,8 @@ import os import random import sys +import time +from datetime import timedelta import numpy as np import torch @@ -35,7 +37,7 @@ from torch.nn import CrossEntropyLoss, MSELoss from sklearn.metrics import classification_report -from utils import result_to_text_file +from utils import result_to_text_file, dictionary_to_json from utils_multiemo import * from transformer.modeling import BertForSequenceClassification from transformer.tokenization import BertTokenizer @@ -150,7 +152,7 @@ def main(): type=float, help="The initial learning rate for Adam.") parser.add_argument('--weight_decay', '--wd', - default=1e-4, + default=0.01, type=float, metavar='W', help='weight decay') @@ -290,6 +292,8 @@ def main(): for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) else: + training_start_time = time.monotonic() + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) @@ -306,7 +310,8 @@ def main(): logger.info('Total parameters: {}'.format(size)) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] schedule = 'warmup_linear' @@ -454,9 +459,9 @@ def main(): y_pred = np.argmax(y_logits, axis=1) print('\n\t**** Classification report ****\n') - print(classification_report(y_true, y_pred, target_names=label_list)) + print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) - report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True) + report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) report['eval_time'] = diff_seconds dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json")) diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 0561cda1..8cc79122 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -114,6 +114,9 @@ def main(): type=str, help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") parser.add_argument("--learning_rate", default=2e-5, type=float, @@ -122,6 +125,12 @@ def main(): default=3.0, type=float, help="Total number of training 
epochs to perform.") + parser.add_argument('--weight_decay', '--wd', + default=0.01, + type=float, + metavar='W', + help='weight decay') + parser.add_argument('--seed', type=int, default=42, @@ -161,9 +170,8 @@ def main(): assert args.pred_distill or args.intermediate_distill, "'pred_distill' and 'intermediate_distill', at least one must be True" logger.info('The args: {}'.format(args)) task_name = args.task_name.lower() - data_dir = os.path.join(args.data_dir) + data_dir = args.data_dir output_dir = os.path.join(args.output_dir, task_name) - # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name) if not os.path.exists(output_dir): os.mkdir(output_dir) @@ -219,7 +227,7 @@ def main(): label_list = processor.get_labels() num_labels = len(label_list) - tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True) + tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case) if args.aug_train: train_examples = processor.get_aug_examples(data_dir) @@ -278,16 +286,20 @@ def main(): # Prepare optimizer param_optimizer = list(student_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] schedule = 'warmup_linear' - optimizer = BertAdam(optimizer_grouped_parameters, - schedule=schedule, - lr=args.learning_rate, - warmup=0.1, - t_total=num_train_optimization_steps) + optimizer = BertAdam( + optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=0.1, + t_total=num_train_optimization_steps + ) loss_mse = MSELoss() global_step 
= 0 best_dev_acc = 0.0 diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py new file mode 100644 index 00000000..d4e0d94a --- /dev/null +++ b/TernaryBERT/run_experiments.py @@ -0,0 +1,85 @@ +import logging +import os +import sys + +PROJECT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger(__name__) + +data_dir = os.path.join('data', 'multiemo2') + +num_train_epochs = 3 +learning_rate = 5e-5 +weight_decay = 0.01 + + +def main(): + os.chdir(PROJECT_FOLDER) + + if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')): + logger.info("Downloading Multiemo data") + cmd = 'python3 -m scripts.download_dataset --data_dir data/multiemo2' + run_process(cmd) + logger.info("Downloading finished") + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')): + logger.info("Downloading bert-base-uncased model") + cmd = 'python3 -m download_bert_base' + run_process(cmd) + logger.info("Downloading finished") + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')): + logger.info("Downloading bert-base-uncased model") + cmd = 'python3 -m download_bert_base' + run_process(cmd) + logger.info("Downloading finished") + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')): + cmd = 'python3 -m multiemo_fine_tune_bert ' + options = [ + '--pretrained_model', 'data/models/bert-base-uncased', + '--data_dir', 'data/multiemo2', + '--task_name', 'multiemo_en_all_sentence', + '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence', + '--do_lower_case' + ] + cmd += ' '.join(options) + logger.info(f"Training bert-base-uncased for multiemo_en_all_sentence") + run_process(cmd) + + cmd = 'python3 -m 
quant_task_multiemo ' + options = [ + '--data_dir', 'data/multiemo2', + '--model_dir ', 'data/models/bert-base-uncased', + '--task_name', 'multiemo_en_all_sentence', + '--output_dir', 'data/models/ternarybert', + '--learning_rate', str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--weight_decay', str(weight_decay), + '--weight_bits', str(2), + '--input_bits', str(8), + '--pred_distill', + '--intermediate_distill', + '--save_fp_model', + '--save_quantized_model', + '--do_lower_case' + ] + cmd += ' '.join(options) + logger.info(f"Training ternarybert for multiemo_en_all_sentence") + run_process(cmd) + + # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence' + # logger.info(f"Gathering results to csv for multiemo_en_all_sentence") + # run_process(cmd) + + +def run_process(proc): + os.system(proc) + + +if __name__ == '__main__': + main() diff --git a/TernaryBERT/scripts/download_dataset.py b/TernaryBERT/scripts/download_dataset.py index 08bdc6fb..701ffd3a 100644 --- a/TernaryBERT/scripts/download_dataset.py +++ b/TernaryBERT/scripts/download_dataset.py @@ -46,7 +46,7 @@ def main(data_dir): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='multiemo2') args = parser.parse_args() if not os.path.isdir(args.data_dir): diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py index b20e8f61..73241068 100644 --- a/TernaryBERT/utils_multiemo.py +++ b/TernaryBERT/utils_multiemo.py @@ -2,6 +2,7 @@ import logging import sys import csv +from typing import List import numpy as np from scipy.stats import pearsonr, spearmanr From e5e731c66322776d06f6b38027a4cf7f25044fed Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 20:16:54 +0100 Subject: [PATCH 25/62] feat: fix Dockerfile --- TernaryBERT/Dockerfile | 8 
++++---- TernaryBERT/requirements.txt | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile index 81616e69..9e8f0495 100644 --- a/TernaryBERT/Dockerfile +++ b/TernaryBERT/Dockerfile @@ -4,17 +4,17 @@ ENV TZ=Europe/Minsk RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt update && \ apt install --no-install-recommends -y build-essential software-properties-common && \ - apt install --no-install-recommends -y python3.8.12 python3-pip python3-dev python3-setuptools python3-distutils && \ + apt install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \ apt clean && rm -rf /var/lib/apt/lists/* WORKDIR /app -RUN python3.8 -m pip install --upgrade pip && \ - python3.8 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \ +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \ -f https://download.pytorch.org/whl/torch_stable.html COPY ./requirements.txt . -RUN python3.8 -m pip install --no-cache-dir -r requirements.txt +RUN python3 -m pip install --no-cache-dir -r requirements.txt RUN rm requirements.txt ARG USER_ID diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt index f86b0709..759eb3ae 100644 --- a/TernaryBERT/requirements.txt +++ b/TernaryBERT/requirements.txt @@ -3,5 +3,8 @@ requests scipy future Pillow -tensorflow==1.14.0 +tensorflow~=1.14.0 +numpy~=1.21.2 +pandas~=1.3.3 +scikit-learn~=1.0 # torch==1.1.0 From d63093a87f99180752725ee8a7857248f63c29ee Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 20:33:09 +0100 Subject: [PATCH 26/62] fixup! 
feat: fix Dockerfile --- TernaryBERT/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile index 9e8f0495..68c2c107 100644 --- a/TernaryBERT/Dockerfile +++ b/TernaryBERT/Dockerfile @@ -2,10 +2,10 @@ FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04 ENV TZ=Europe/Minsk RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN apt update && \ - apt install --no-install-recommends -y build-essential software-properties-common && \ - apt install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \ - apt clean && rm -rf /var/lib/apt/lists/* +RUN apt-get update && \ + apt-get install --no-install-recommends -y build-essential software-properties-common && \ + apt-get install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \ + apt-get clean && rm -rf /var/lib/apt/lists/* WORKDIR /app From ace5d6b0c3e36a48203250788c70182aaac60a06 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 20:47:13 +0100 Subject: [PATCH 27/62] fixup! fixup! 
feat: fix Dockerfile --- TernaryBERT/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt index 759eb3ae..71ce02ad 100644 --- a/TernaryBERT/requirements.txt +++ b/TernaryBERT/requirements.txt @@ -3,7 +3,7 @@ requests scipy future Pillow -tensorflow~=1.14.0 +# tensorflow~=1.14.0 numpy~=1.21.2 pandas~=1.3.3 scikit-learn~=1.0 From a6c511117560958441527c86a9ca7936d2973de7 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 20:56:03 +0100 Subject: [PATCH 28/62] feat: fix commands in runneing experiment script --- TernaryBERT/run_experiments.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py index d4e0d94a..a2e28acc 100644 --- a/TernaryBERT/run_experiments.py +++ b/TernaryBERT/run_experiments.py @@ -22,24 +22,18 @@ def main(): if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')): logger.info("Downloading Multiemo data") - cmd = 'python3 -m scripts.download_dataset --data_dir data/multiemo2' + cmd = 'python3 scripts/download_dataset.py --data_dir data/multiemo2' run_process(cmd) logger.info("Downloading finished") if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')): logger.info("Downloading bert-base-uncased model") - cmd = 'python3 -m download_bert_base' - run_process(cmd) - logger.info("Downloading finished") - - if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')): - logger.info("Downloading bert-base-uncased model") - cmd = 'python3 -m download_bert_base' + cmd = 'python3 download_bert_base.py' run_process(cmd) logger.info("Downloading finished") if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')): - cmd = 'python3 -m multiemo_fine_tune_bert ' + cmd = 'python3 multiemo_fine_tune_bert.py ' options = [ '--pretrained_model', 
'data/models/bert-base-uncased', '--data_dir', 'data/multiemo2', @@ -51,7 +45,7 @@ def main(): logger.info(f"Training bert-base-uncased for multiemo_en_all_sentence") run_process(cmd) - cmd = 'python3 -m quant_task_multiemo ' + cmd = 'python3 quant_task_multiemo.py ' options = [ '--data_dir', 'data/multiemo2', '--model_dir ', 'data/models/bert-base-uncased', From add490329e4083d2fec2111fd364e2dfad812811 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:01:19 +0100 Subject: [PATCH 29/62] fixup! feat: fix commands in runneing experiment script --- TernaryBERT/run_experiments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py index a2e28acc..c62ef670 100644 --- a/TernaryBERT/run_experiments.py +++ b/TernaryBERT/run_experiments.py @@ -2,7 +2,7 @@ import os import sys -PROJECT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__)) DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') log_format = '%(asctime)s %(message)s' @@ -18,6 +18,7 @@ def main(): + print(PROJECT_FOLDER) os.chdir(PROJECT_FOLDER) if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')): From 6265a5f707211814c02c97416a6a1cf3d7420573 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:03:00 +0100 Subject: [PATCH 30/62] fixup! fixup! 
feat: fix commands in runneing experiment script --- TernaryBERT/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt index 71ce02ad..7dc69627 100644 --- a/TernaryBERT/requirements.txt +++ b/TernaryBERT/requirements.txt @@ -7,4 +7,5 @@ Pillow numpy~=1.21.2 pandas~=1.3.3 scikit-learn~=1.0 +tqdm # torch==1.1.0 From 05e145e3dc5494960be81e033e6b369799bd03a8 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:17:56 +0100 Subject: [PATCH 31/62] feat: fix loading data --- TernaryBERT/quant_task_multiemo.py | 5 ----- TernaryBERT/utils_multiemo.py | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 8cc79122..1d4e959d 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -1,16 +1,11 @@ from __future__ import absolute_import, division, print_function import argparse -import logging -import os import random -import sys -import pickle import copy import time from datetime import timedelta -import numpy as np import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.nn import CrossEntropyLoss, MSELoss diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py index 73241068..56238d50 100644 --- a/TernaryBERT/utils_multiemo.py +++ b/TernaryBERT/utils_multiemo.py @@ -74,6 +74,13 @@ def _read_tsv(cls, input_file, quotechar=None): lines.append(line) return lines + @classmethod + def _read_txt(cls, input_file: str) -> List[str]: + """Reads a plain text file and returns its lines.""" + with open(input_file, "r", encoding='UTF-8') as f: + lines = f.read().splitlines() + return lines + class MultiemoProcessor(DataProcessor): """Processor for the Multiemo data2 set""" From 406cacbdaf7a7ba2ce2da6b3f9e1ff025c1ab42f Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:20:31 +0100 Subject: 
[PATCH 32/62] feat: handle no directory error --- TernaryBERT/multiemo_fine_tune_bert.py | 4 ++-- TernaryBERT/quant_task_multiemo.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index a41b4d69..66fa9923 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -224,8 +224,8 @@ def main(): # Prepare task settings if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) + + os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 1d4e959d..82156815 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -168,8 +168,7 @@ def main(): data_dir = args.data_dir output_dir = os.path.join(args.output_dir, task_name) - if not os.path.exists(output_dir): - os.mkdir(output_dir) + os.makedirs(output_dir, exist_ok=True) if args.student_model is None: args.student_model = os.path.join(args.model_dir, task_name) From 58f9d0514f665468d491318e2c21a90d502bb9a1 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:26:32 +0100 Subject: [PATCH 33/62] feat: correct metrics counting --- TernaryBERT/utils_multiemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py index 56238d50..bc99d1c9 100644 --- a/TernaryBERT/utils_multiemo.py +++ b/TernaryBERT/utils_multiemo.py @@ -230,7 +230,7 @@ def acc_and_f1(preds, labels): def compute_metrics(task_name, logits, labels): preds = np.argmax(logits, axis=1) assert len(preds) == len(labels) - if task_name == "multiemo": + if 'multiemo' in task_name: return acc_and_f1(preds, labels) else: 
raise KeyError(task_name) From 5cddb66f55fb09c039691057cf8c6cb71dd0beee Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sat, 27 Nov 2021 21:37:32 +0100 Subject: [PATCH 34/62] feat: correct f1 score calculation and batch size issue --- TernaryBERT/multiemo_fine_tune_bert.py | 9 ++------- TernaryBERT/run_experiments.py | 5 +++++ TernaryBERT/utils_multiemo.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index 66fa9923..ef867829 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -21,15 +21,10 @@ from __future__ import absolute_import, division, print_function import argparse -import csv -import logging -import os import random -import sys import time from datetime import timedelta -import numpy as np import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from tqdm import tqdm, trange @@ -140,11 +135,11 @@ def main(): action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", - default=32, + default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", - default=32, + default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py index c62ef670..0c0882f4 100644 --- a/TernaryBERT/run_experiments.py +++ b/TernaryBERT/run_experiments.py @@ -12,6 +12,7 @@ data_dir = os.path.join('data', 'multiemo2') +batch_size = 16 num_train_epochs = 3 learning_rate = 5e-5 weight_decay = 0.01 @@ -40,6 +41,10 @@ def main(): '--data_dir', 'data/multiemo2', '--task_name', 'multiemo_en_all_sentence', '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence', + '--learning_rate', str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--weight_decay', 
str(weight_decay), + '--train_batch_size', str(batch_size), '--do_lower_case' ] cmd += ' '.join(options) diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py index bc99d1c9..e7ba2a24 100644 --- a/TernaryBERT/utils_multiemo.py +++ b/TernaryBERT/utils_multiemo.py @@ -6,7 +6,7 @@ import numpy as np from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score +from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score logger = logging.getLogger() @@ -214,12 +214,12 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): def simple_accuracy(preds, labels): - return (preds == labels).mean() + return accuracy_score(y_true=labels, y_pred=preds) def acc_and_f1(preds, labels): - acc = simple_accuracy(preds, labels) - f1 = f1_score(y_true=labels, y_pred=preds) + acc = accuracy_score(y_true=labels, y_pred=preds) + f1 = f1_score(y_true=labels, y_pred=preds, average='macro') return { "acc": acc, "f1": f1, From dc94fc237157fb6b379fd33a71b2c5edea4d4cff Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 09:31:05 +0100 Subject: [PATCH 35/62] feat: refactor training loops --- TernaryBERT/multiemo_fine_tune_bert.py | 161 +++++++++------------- TernaryBERT/quant_task_multiemo.py | 177 +++++++++++++------------ 2 files changed, 156 insertions(+), 182 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index ef867829..23439d93 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -167,10 +167,6 @@ def main(): type=int, default=42, help="random seed for initialization") - parser.add_argument('--gradient_accumulation_steps', - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") # added arguments parser.add_argument('--aug_train', @@ -185,19 +181,8 @@ def main(): # intermediate distillation default parameters default_params = { "multiemo": 
{"num_train_epochs": 3, "max_seq_length": 128}, - "cola": {"num_train_epochs": 3, "max_seq_length": 64}, - "mnli": {"num_train_epochs": 3, "max_seq_length": 128}, - "mrpc": {"num_train_epochs": 3, "max_seq_length": 128}, - "sst-2": {"num_train_epochs": 3, "max_seq_length": 64}, - "sts-b": {"num_train_epochs": 3, "max_seq_length": 128}, - "qqp": {"num_train_epochs": 3, "max_seq_length": 128}, - "qnli": {"num_train_epochs": 3, "max_seq_length": 128}, - "rte": {"num_train_epochs": 5, "max_seq_length": 128} } - - acc_tasks = ["multiemo", "mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"] - corr_tasks = ["sts-b"] - mcc_tasks = ["cola"] + acc_tasks = ["multiemo"] # Prepare devices device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") @@ -206,7 +191,6 @@ def main(): logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) - logger.info("device: {} n_gpu: {}".format(device, n_gpu)) # Prepare seed @@ -223,7 +207,6 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() - if task_name in default_params: args.max_seq_len = default_params[task_name]["max_seq_length"] @@ -252,14 +235,8 @@ def main(): train_examples = processor.get_train_examples(args.data_dir) else: train_examples = processor.get_aug_examples(args.data_dir) - if args.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs + num_train_optimization_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, 
output_mode) @@ -295,49 +272,28 @@ def main(): logger.info(" Num steps = %d", num_train_optimization_steps) if n_gpu > 1: model = torch.nn.DataParallel(model) - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - size = 0 - for n, p in model.named_parameters(): - logger.info('n: {}'.format(n)) - size += p.nelement() - - logger.info('Total parameters: {}'.format(size)) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], - 'weight_decay': args.weight_decay}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - schedule = 'warmup_linear' - - optimizer = BertAdam(optimizer_grouped_parameters, - schedule=schedule, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) + + optimizer = get_optimizer(args, model, num_train_optimization_steps) # Train and evaluate global_step = 0 best_dev_acc = 0.0 output_eval_file = os.path.join(args.output_dir, "eval_results.txt") - for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"): + for epoch_ in range(int(args.num_train_epochs)): tr_loss = 0. tr_cls_loss = 0. model.train() nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)): + for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch if input_ids.size()[0] != args.train_batch_size: continue cls_loss = 0. - logits, _, _ = model(input_ids, segment_ids, input_mask) if output_mode == "classification": @@ -352,68 +308,53 @@ def main(): if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
- if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps loss.backward() - tr_loss += loss.item() nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() - global_step += 1 + optimizer.step() + optimizer.zero_grad() + global_step += 1 - if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \ - (global_step + 1) == num_train_optimization_steps: - logger.info("***** Running evaluation *****") - logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) - model.eval() + model.eval() - loss = tr_loss / (step + 1) - cls_loss = tr_cls_loss / (step + 1) + loss = tr_loss / nb_tr_steps + cls_loss = tr_cls_loss / nb_tr_steps - result, _ = do_eval(model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) - result['global_step'] = global_step - result['cls_loss'] = cls_loss - result['loss'] = loss + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['epoch'] = epoch_ + 1 + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + result_to_text_file(result, output_eval_file) - result_to_text_file(result, output_eval_file) + save_model = False - save_model = False + if task_name in acc_tasks and result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True - if task_name in acc_tasks and result['acc'] > best_dev_acc: - best_dev_acc = result['acc'] - save_model = True + if save_model: + logger.info("***** Save model *****") + model_to_save = 
model.module if hasattr(model, 'module') else model - if task_name in corr_tasks and result['corr'] > best_dev_acc: - best_dev_acc = result['corr'] - save_model = True + model_name = WEIGHTS_NAME + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - if task_name in mcc_tasks and result['mcc'] > best_dev_acc: - best_dev_acc = result['mcc'] - save_model = True + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) - if save_model: - logger.info("***** Save model *****") - model_to_save = model.module if hasattr(model, 'module') else model - - model_name = WEIGHTS_NAME - - output_model_file = os.path.join(args.output_dir, model_name) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - - torch.save(model_to_save.state_dict(), output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(args.output_dir) - - model.train() + model.train() # Measure End Time training_end_time = time.monotonic() @@ -436,13 +377,14 @@ def main(): test_data, test_labels = get_tensor_data(output_mode, test_features) test_sampler = SequentialSampler(eval_data) - test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size) logger.info("\n***** Running evaluation on test dataset *****") logger.info(" Num examples = %d", len(test_features)) logger.info(" Batch size = %d", args.batch_size) eval_start_time = time.monotonic() + model.eval() result, y_logits = do_eval(model, task_name, test_dataloader, device, output_mode, test_labels, num_labels) eval_end_time = time.monotonic() @@ -461,5 +403,30 @@ def main(): dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json")) +def get_optimizer(args, model, 
num_train_optimization_steps): + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + size = 0 + for n, p in model.named_parameters(): + logger.info('n: {}'.format(n)) + size += p.nelement() + logger.info('Total parameters: {}'.format(size)) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + optimizer = BertAdam( + optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps + ) + return optimizer + + if __name__ == "__main__": main() diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 82156815..cf4b1c09 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -10,7 +10,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.nn import CrossEntropyLoss, MSELoss from sklearn.metrics import classification_report -from tqdm import trange +from tqdm import trange, tqdm from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification @@ -240,7 +240,6 @@ def main(): eval_examples = processor.get_dev_examples(data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - eval_data, eval_labels = get_tensor_data(output_mode, eval_features) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) @@ -277,23 +276,7 @@ def main(): if n_gpu > 1: student_model = 
torch.nn.DataParallel(student_model) - # Prepare optimizer - param_optimizer = list(student_model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], - 'weight_decay': args.weight_decay}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - schedule = 'warmup_linear' - optimizer = BertAdam( - optimizer_grouped_parameters, - schedule=schedule, - lr=args.learning_rate, - warmup=0.1, - t_total=num_train_optimization_steps - ) + optimizer = get_optimizer(args, num_train_optimization_steps, student_model) loss_mse = MSELoss() global_step = 0 best_dev_acc = 0.0 @@ -307,7 +290,7 @@ def main(): for epoch_ in trange(int(args.num_train_epochs)): nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(train_dataloader): + for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): student_model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch @@ -358,70 +341,73 @@ def main(): tr_loss += loss.item() nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 - if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1: - logger.info("***** Running evaluation *****") - logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) - if previous_best is not None: - logger.info(f"{fp32_performance}\nPrevious best = {previous_best}") - - student_model.eval() - - loss = tr_loss / (step + 1) - cls_loss = tr_cls_loss / (step + 1) - att_loss = tr_att_loss / (step + 1) - rep_loss = tr_rep_loss / (step + 1) - - result, _ = do_eval(student_model, task_name, eval_dataloader, - device, output_mode, eval_labels, num_labels) - - result['global_step'] = global_step - 
result['cls_loss'] = cls_loss - result['att_loss'] = att_loss - result['rep_loss'] = rep_loss - result['loss'] = loss - - result_to_text_file(result, output_eval_file) - - save_model = False - - if task_name in acc_tasks and result['acc'] > best_dev_acc: - previous_best = f"f1/acc:{result['f1']}/{result['acc']}" - best_dev_acc = result['acc'] - save_model = True - - if save_model: - logger.info(fp32_performance) - logger.info(previous_best) - if args.save_fp_model: - logger.info("******************** Save full precision model ********************") - model_to_save = student_model.module if hasattr(student_model, 'module') else student_model - output_model_file = os.path.join(output_dir, WEIGHTS_NAME) - output_config_file = os.path.join(output_dir, CONFIG_NAME) - - torch.save(model_to_save.state_dict(), output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_dir) - if args.save_quantized_model: - logger.info("******************** Save quantized model ********************") - output_quant_dir = os.path.join(output_dir, 'quant') - if not os.path.exists(output_quant_dir): - os.makedirs(output_quant_dir) - model_to_save = student_model.module if hasattr(student_model, 'module') else student_model - quant_model = copy.deepcopy(model_to_save) - for name, module in quant_model.named_modules(): - if hasattr(module, 'weight_quantizer'): - module.weight.data = module.weight_quantizer.apply( - module.weight, - module.weight_clip_val, - module.weight_bits, True - ) - - output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME) - output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) - - torch.save(quant_model.state_dict(), output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_quant_dir) + + + logger.info("***** Running evaluation *****") + logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) + if previous_best is not 
None: + logger.info(f"{fp32_performance}\nPrevious best = {previous_best}") + + student_model.eval() + + loss = tr_loss / nb_tr_steps + cls_loss = tr_cls_loss / nb_tr_steps + att_loss = tr_att_loss / nb_tr_steps + rep_loss = tr_rep_loss / nb_tr_steps + + result, _ = do_eval(student_model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + + result['epoch'] = epoch_ + 1 + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['att_loss'] = att_loss + result['rep_loss'] = rep_loss + result['loss'] = loss + + result_to_text_file(result, output_eval_file) + + save_model = False + + if task_name in acc_tasks and result['acc'] > best_dev_acc: + previous_best = f"f1/acc:{result['f1']}/{result['acc']}" + best_dev_acc = result['acc'] + save_model = True + + if save_model: + logger.info(fp32_performance) + logger.info(previous_best) + if args.save_fp_model: + logger.info("******************** Save full precision model ********************") + model_to_save = student_model.module if hasattr(student_model, 'module') else student_model + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_dir) + + if args.save_quantized_model: + logger.info("******************** Save quantized model ********************") + output_quant_dir = os.path.join(output_dir, 'quant') + if not os.path.exists(output_quant_dir): + os.makedirs(output_quant_dir) + model_to_save = student_model.module if hasattr(student_model, 'module') else student_model + quant_model = copy.deepcopy(model_to_save) + for name, module in quant_model.named_modules(): + if hasattr(module, 'weight_quantizer'): + module.weight.data = module.weight_quantizer.apply( + module.weight, + module.weight_clip_val, + module.weight_bits, True + ) + + output_model_file = 
os.path.join(output_quant_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_quant_dir, CONFIG_NAME) + + torch.save(quant_model.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_quant_dir) # Measure End Time training_end_time = time.monotonic() @@ -444,13 +430,14 @@ def main(): test_data, test_labels = get_tensor_data(output_mode, test_features) test_sampler = SequentialSampler(eval_data) - test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) + test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size) logger.info("\n***** Running evaluation on test dataset *****") logger.info(" Num examples = %d", len(test_features)) logger.info(" Batch size = %d", args.batch_size) eval_start_time = time.monotonic() + student_model.eval() result, y_logits = do_eval(student_model, task_name, test_dataloader, device, output_mode, test_labels, num_labels) eval_end_time = time.monotonic() @@ -469,5 +456,25 @@ def main(): dictionary_to_json(report, os.path.join(output_dir, "test_results.json")) +def get_optimizer(args, num_train_optimization_steps, student_model): + # Prepare optimizer + param_optimizer = list(student_model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + schedule = 'warmup_linear' + optimizer = BertAdam( + optimizer_grouped_parameters, + schedule=schedule, + lr=args.learning_rate, + warmup=0.1, + t_total=num_train_optimization_steps + ) + return optimizer + + if __name__ == "__main__": main() From 0bf88607c29f07530a585894665a49bed6ddf7e9 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 11:34:47 +0100 Subject: 
[PATCH 36/62] feat: correct bacth size issue --- TernaryBERT/multiemo_fine_tune_bert.py | 6 +++--- TernaryBERT/quant_task_multiemo.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index 23439d93..c97e681d 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -70,7 +70,7 @@ def do_eval(model, task_name, eval_dataloader, nb_eval_steps = 0 all_logits = None - for _, batch_ in enumerate(eval_dataloader): + for batch_ in tqdm(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ @@ -377,11 +377,11 @@ def main(): test_data, test_labels = get_tensor_data(output_mode, test_features) test_sampler = SequentialSampler(eval_data) - test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size) + test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.eval_batch_size) logger.info("\n***** Running evaluation on test dataset *****") logger.info(" Num examples = %d", len(test_features)) - logger.info(" Batch size = %d", args.batch_size) + logger.info(" Batch size = %d", args.eval_batch_size) eval_start_time = time.monotonic() model.eval() diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index cf4b1c09..99efa80e 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -46,7 +46,7 @@ def do_eval(model, task_name, eval_dataloader, nb_eval_steps = 0 all_logits = None - for _, batch_ in enumerate(eval_dataloader): + for batch_ in tqdm(eval_dataloader): batch_ = tuple(t.to(device) for t in batch_) with torch.no_grad(): input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ @@ -290,7 +290,7 @@ def main(): for epoch_ in trange(int(args.num_train_epochs)): nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in 
enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): + for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): student_model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch @@ -342,7 +342,6 @@ def main(): nb_tr_examples += label_ids.size(0) nb_tr_steps += 1 - logger.info("***** Running evaluation *****") logger.info(" {} step of {} steps".format(global_step, num_train_optimization_steps)) if previous_best is not None: From e4da554d2b1f0b05c61ca0044274b8a06ad3a412 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 11:43:00 +0100 Subject: [PATCH 37/62] feat: correct saving condition --- TernaryBERT/multiemo_fine_tune_bert.py | 2 +- TernaryBERT/quant_task_multiemo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index c97e681d..57373c9e 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -338,7 +338,7 @@ def main(): save_model = False - if task_name in acc_tasks and result['acc'] > best_dev_acc: + if result['acc'] > best_dev_acc: best_dev_acc = result['acc'] save_model = True diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 99efa80e..c91cde7a 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -368,7 +368,7 @@ def main(): save_model = False - if task_name in acc_tasks and result['acc'] > best_dev_acc: + if result['acc'] > best_dev_acc: previous_best = f"f1/acc:{result['f1']}/{result['acc']}" best_dev_acc = result['acc'] save_model = True From 8bdb9e272c726bad371771891eda54c7fcf89438 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 12:39:55 +0100 Subject: [PATCH 38/62] feat: correct test data loading --- TernaryBERT/multiemo_fine_tune_bert.py | 2 +- TernaryBERT/quant_task_multiemo.py | 4 
++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index 57373c9e..f4d7eb72 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -377,7 +377,7 @@ def main(): test_data, test_labels = get_tensor_data(output_mode, test_features) test_sampler = SequentialSampler(eval_data) - test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.eval_batch_size) + test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) logger.info("\n***** Running evaluation on test dataset *****") logger.info(" Num examples = %d", len(test_features)) diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index c91cde7a..cd577792 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -132,7 +132,7 @@ def main(): help="random seed for initialization") parser.add_argument('--aug_train', - action='store_false', + action='store_true', help="Whether to use augmented data or not") parser.add_argument('--pred_distill', action='store_true', @@ -429,7 +429,7 @@ def main(): test_data, test_labels = get_tensor_data(output_mode, test_features) test_sampler = SequentialSampler(eval_data) - test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size) + test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size) logger.info("\n***** Running evaluation on test dataset *****") logger.info(" Num examples = %d", len(test_features)) From 11ce3b796e8724b274c437f4f223fe3e92949160 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 14:12:26 +0100 Subject: [PATCH 39/62] fixup! 
feat: correct test data loading --- TernaryBERT/multiemo_fine_tune_bert.py | 2 +- TernaryBERT/quant_task_multiemo.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py index f4d7eb72..8ea25d40 100644 --- a/TernaryBERT/multiemo_fine_tune_bert.py +++ b/TernaryBERT/multiemo_fine_tune_bert.py @@ -376,7 +376,7 @@ def main(): output_mode) test_data, test_labels = get_tensor_data(output_mode, test_features) - test_sampler = SequentialSampler(eval_data) + test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) logger.info("\n***** Running evaluation on test dataset *****") diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index cd577792..02a006f1 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -184,7 +184,7 @@ def main(): } default_params = { - "multiemo": {"max_seq_length": 128, "batch_size": 16, "eval_step": 50} + "multiemo": {"max_seq_length": 128, "batch_size": 16} } acc_tasks = ["multiemo"] @@ -205,7 +205,11 @@ def main(): if n_gpu > 0: args.batch_size = int(args.batch_size * n_gpu) args.max_seq_length = default_params[task_name]["max_seq_length"] - args.eval_step = default_params[task_name]["eval_step"] + elif 'multiemo' in task_name: + args.batch_size = default_params['multiemo']["batch_size"] + if n_gpu > 0: + args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params['multiemo']["max_seq_length"] if 'multiemo' in task_name: _, lang, domain, kind = task_name.split('_') @@ -428,7 +432,7 @@ def main(): output_mode) test_data, test_labels = get_tensor_data(output_mode, test_features) - test_sampler = SequentialSampler(eval_data) + test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size) 
logger.info("\n***** Running evaluation on test dataset *****") From 60555878b777ac79cf76369351f0be7dc8cda5db Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Sun, 28 Nov 2021 15:52:16 +0100 Subject: [PATCH 40/62] feat: fix loading teacher model --- TernaryBERT/quant_task_multiemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 02a006f1..4a0c43b7 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -248,7 +248,7 @@ def main(): eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model) + teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model, num_labels=num_labels) teacher_model.to(device) teacher_model.eval() if n_gpu > 1: From 98379e51d2121ebdbb2f440ad4ff3f7edfbae4ec Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 10:40:06 +0100 Subject: [PATCH 41/62] feat: add data processing for multiemo --- .../transformers/data/metrics/__init__.py | 19 ++ .../transformers/data/processors/multiemo.py | 186 ++++++++++++++++++ .../transformers/data/processors/utils.py | 9 + 3 files changed, 214 insertions(+) create mode 100644 DynaBERT/transformers/data/processors/multiemo.py diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py index c9ebaac3..942bd7b1 100644 --- a/DynaBERT/transformers/data/metrics/__init__.py +++ b/DynaBERT/transformers/data/metrics/__init__.py @@ -47,6 +47,16 @@ def acc_and_f1(preds, labels): } + def multiclass_acc_and_f1(preds, labels): + acc = accuracy_score(y_true=labels, y_pred=preds) + f1 = f1_score(y_true=labels, y_pred=preds, average='macro') + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + def pearson_and_spearman(preds, labels): 
pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] @@ -81,3 +91,12 @@ def glue_compute_metrics(task_name, preds, labels): return {"acc": simple_accuracy(preds, labels)} else: raise KeyError(task_name) + + + def multiemo_compute_metrics(task_name, logits, labels): + preds = np.argmax(logits, axis=1) + assert len(preds) == len(labels) + if 'multiemo' in task_name: + return multiclass_acc_and_f1(preds, labels) + else: + raise KeyError(task_name) diff --git a/DynaBERT/transformers/data/processors/multiemo.py b/DynaBERT/transformers/data/processors/multiemo.py new file mode 100644 index 00000000..be4a0eb2 --- /dev/null +++ b/DynaBERT/transformers/data/processors/multiemo.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" GLUE processors and helpers """ + +import logging +import os +import numpy as np +from .utils import DataProcessor, InputExample, InputFeatures +from ...file_utils import is_tf_available + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + + +def multiemo_convert_examples_to_features(examples, tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): + """ + Loads a data file into a list of ``InputFeatures`` + + Args: + examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) + + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. 
+ + """ + + if task is not None: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = multiemo_output_modes_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), + max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), + max_length) + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label)) + + return features + + +class MultiemoProcessor(DataProcessor): + """Processor for the Multiemo data2 set""" + + def __init__(self, lang: str, domain: str, kind: str): + super(MultiemoProcessor, self).__init__() + self.lang = lang.lower() + self.domain = domain.lower() + self.kind = kind.lower() + + def get_train_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = 
self.get_set_type_path(data_dir, 'train') + logger.info(f"LOOKING AT {file_path}") + return self._create_examples(self._read_txt(file_path), "train") + + def get_dev_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'dev') + return self._create_examples(self._read_txt(file_path), "dev") + + def get_test_examples(self, data_dir: str) -> List[InputExample]: + """See base class.""" + file_path = self.get_set_type_path(data_dir, 'test') + return self._create_examples(self._read_txt(file_path), "test") + + def get_set_type_path(self, data_dir: str, set_type: str) -> str: + return os.path.join(data_dir, self.domain + '.' + self.kind + '.' + set_type + '.' + self.lang + '.txt') + + def get_labels(self) -> List[str]: + """See base class.""" + if self.kind == 'text': + return ["meta_amb", "meta_minus_m", "meta_plus_m", "meta_zero"] + else: + return ["z_amb", "z_minus_m", "z_plus_m", "z_zero"] + + @staticmethod + def _create_examples(lines: List[str], set_type: str) -> List[InputExample]: + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + split_line = line.split('__label__') + text_a = split_line[0] + label = split_line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +multiemo_tasks_num_labels = { + "multiemo": 4, +} + +multiemo_output_modes = { + "multiemo": "classification" +} diff --git a/DynaBERT/transformers/data/processors/utils.py b/DynaBERT/transformers/data/processors/utils.py index 2d7628f9..1556ef02 100644 --- a/DynaBERT/transformers/data/processors/utils.py +++ b/DynaBERT/transformers/data/processors/utils.py @@ -19,6 +19,7 @@ import copy import json + class InputExample(object): """ A single training/test example for simple sequence classification. @@ -32,6 +33,7 @@ class InputExample(object): label: (Optional) string. The label of the example. 
This should be specified for train and dev examples, but not for test examples. """ + def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a @@ -123,3 +125,10 @@ def _read_tsv(cls, input_file, quotechar=None): line = list(unicode(cell, 'utf-8') for cell in line) lines.append(line) return lines + + @classmethod + def _read_txt(cls, input_file: str) -> List[str]: + """Reads a tab separated value file.""" + with open(input_file, "r", encoding='UTF-8') as f: + lines = f.read().splitlines() + return lines From 51d250bacc73e70f6b60295a0643f16a284fee76 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 16:02:11 +0100 Subject: [PATCH 42/62] feat: add training dynabert for multiemo --- DynaBERT/download_bert_base.py | 31 + DynaBERT/multiemo_fine_tune_bert.py | 429 +++++++++++++ DynaBERT/run_glue.py | 36 +- DynaBERT/run_multiemo.py | 577 ++++++++++++++++++ DynaBERT/scripts/download_dataset.py | 55 ++ .../transformers/data/metrics/__init__.py | 2 + DynaBERT/utils.py | 33 + 7 files changed, 1145 insertions(+), 18 deletions(-) create mode 100644 DynaBERT/download_bert_base.py create mode 100644 DynaBERT/multiemo_fine_tune_bert.py create mode 100644 DynaBERT/run_multiemo.py create mode 100644 DynaBERT/scripts/download_dataset.py create mode 100644 DynaBERT/utils.py diff --git a/DynaBERT/download_bert_base.py b/DynaBERT/download_bert_base.py new file mode 100644 index 00000000..fa99e41a --- /dev/null +++ b/DynaBERT/download_bert_base.py @@ -0,0 +1,31 @@ +import os +import requests +import tarfile + +url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz' + +output_path = os.path.join('data', 'models') +os.makedirs(output_path, exist_ok=True) + +output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz') +model_folder = os.path.join(output_path, 'bert-base-uncased') + +response = requests.get(url, stream=True) +if response.status_code == 200: + with open(output_tar, 'wb') as f: + 
f.write(response.raw.read()) + +with tarfile.open(name=output_tar, mode="r|gz") as tar_ref: + tar_ref.extractall(model_folder) + +os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json')) + +os.remove(output_tar) + +url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt' +r = requests.get(url_vocab) + +with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f: + f.write(r.content) + +print('Completed!') diff --git a/DynaBERT/multiemo_fine_tune_bert.py b/DynaBERT/multiemo_fine_tune_bert.py new file mode 100644 index 00000000..e0f92f0c --- /dev/null +++ b/DynaBERT/multiemo_fine_tune_bert.py @@ -0,0 +1,429 @@ +# coding=utf-8 +# 2019.12.2-Changed for TinyBERT task-specific distillation +# Huawei Technologies Co., Ltd. +# Copyright 2020 Huawei Technologies Co., Ltd. +# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import csv +import logging +import os +import random +import sys +import time +from datetime import timedelta + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) +from tqdm import tqdm, trange + +from sklearn.metrics import classification_report + +from utils import result_to_text_file, dictionary_to_json +from transformers.modeling_bert import BertForSequenceClassification +from transformers.tokenization_bert import BertTokenizer +from transformers.data.metrics import multiemo_compute_metrics as compute_metrics +from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \ + MultiemoProcessor +from transformers import AdamW, WarmupLinearSchedule + +from transformers.file_utils import WEIGHTS_NAME, CONFIG_NAME + +csv.field_size_limit(sys.maxsize) + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +fh = logging.FileHandler('debug_layer_loss.log') +fh.setFormatter(logging.Formatter(log_format)) +logging.getLogger().addHandler(fh) +logger = logging.getLogger() + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + return tensor_data, all_label_ids + 
+ +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + all_logits = None + + for batch in tqdm(eval_dataloader): + model.eval() + batch = tuple(t.to(device) for t in batch) + with torch.no_grad(): + inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} + outputs = model(**inputs) + tmp_eval_loss, logits = outputs[:2] + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + + if all_logits is None: + all_logits = logits.detach().cpu().numpy() + else: + all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + if output_mode == "regression": + all_logits = np.squeeze(all_logits) + result = compute_metrics(task_name, all_logits, eval_labels.numpy()) + result['eval_loss'] = eval_loss + return result, all_logits + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--pretrained_model", + default=None, + type=str, + help="The pretrained model dir.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=16, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=16, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument('--weight_decay', '--wd', + default=0.01, + type=float, + metavar='W', + help='weight decay') + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + + # added arguments + parser.add_argument('--aug_train', + action='store_true') + parser.add_argument('--eval_step', + type=int, + default=50) + + args = parser.parse_args() + logger.info('The args: {}'.format(args)) + + # intermediate distillation default parameters + default_params = { + "multiemo": {"num_train_epochs": 3, "max_seq_length": 128}, + } + acc_tasks = ["multiemo"] + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + logger.info("device: {} n_gpu: {}".format(device, n_gpu)) + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + # Prepare task settings + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + + os.makedirs(args.output_dir, exist_ok=True) + + task_name = args.task_name.lower() + if task_name in default_params: + args.max_seq_len = default_params[task_name]["max_seq_length"] + + if not args.do_eval: + if task_name in default_params: + args.num_train_epoch = default_params[task_name]["num_train_epochs"] + + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = 'classification' + else: + raise ValueError("Task not found: %s" % task_name) + + label_list = 
processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) + + if not args.do_eval: + if not args.aug_train: + train_examples = processor.get_train_examples(args.data_dir) + else: + train_examples = processor.get_aug_examples(args.data_dir) + + t_total = len(train_examples) // args.gradient_accumulation_steps * args.num_train_epochs + + train_features = convert_examples_to_features( + train_examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0, + ) + train_data, _ = get_tensor_data(output_mode, train_features) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + eval_examples = processor.get_dev_examples(args.data_dir) + eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) + eval_data, eval_labels = get_tensor_data(output_mode, eval_features) + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels) + model.to(device) + if args.do_eval: + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + else: + training_start_time = time.monotonic() + + logger.info("***** Running training *****") + logger.info(" Num 
examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", t_total) + if n_gpu > 1: + model = torch.nn.DataParallel(model) + + optimizer, scheduler = get_optimizer_and_scheduler(args, model, t_total) + + # Train and evaluate + global_step = 0 + best_dev_acc = 0.0 + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + for epoch_ in range(int(args.num_train_epochs)): + tr_loss = 0. + tr_cls_loss = 0. + + model.train() + nb_tr_steps = 0 + + for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)): + batch = tuple(t.to(device) for t in batch) + batch = tuple(t.to(args.device) for t in batch) + inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3], + 'token_type_ids': batch[2] if args.model_type in ['bert'] else None} + + cls_loss = model(**inputs)[0] + + loss = cls_loss + tr_cls_loss += cls_loss.item() + + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ + loss.backward() + tr_loss += loss.item() + nb_tr_steps += 1 + + optimizer.step() + scheduler.step() + model.zero_grad() + global_step += 1 + + logger.info("***** Running evaluation *****") + logger.info(" Epoch = {} iter {} step".format(epoch_, global_step)) + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + + model.eval() + + loss = tr_loss / nb_tr_steps + cls_loss = tr_cls_loss / nb_tr_steps + + result, _ = do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels) + result['epoch'] = epoch_ + 1 + result['global_step'] = global_step + result['cls_loss'] = cls_loss + result['loss'] = loss + result_to_text_file(result, output_eval_file) + + save_model = False + + if result['acc'] > best_dev_acc: + best_dev_acc = result['acc'] + save_model = True + + if save_model: + logger.info("***** Save model *****") + model_to_save = model.module if hasattr(model, 'module') else model + + model_name = WEIGHTS_NAME + output_model_file = os.path.join(args.output_dir, model_name) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + model.train() + + # Measure End Time + training_end_time = time.monotonic() + + diff = timedelta(seconds=training_end_time - training_start_time) + diff_seconds = diff.total_seconds() + + training_parameters = vars(args) + training_parameters['training_time'] = diff_seconds + + output_training_params_file = os.path.join(args.output_dir, "training_params.json") + dictionary_to_json(training_parameters, output_training_params_file) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(args.data_dir) + test_features = convert_examples_to_features( + test_examples, + tokenizer, + label_list=label_list, + 
def get_optimizer_and_scheduler(args, model, t_total):
    """Build the AdamW optimizer and the linear warmup/decay schedule.

    Args:
        args: Parsed arguments. Reads ``weight_decay`` and ``learning_rate``,
            plus ``warmup_steps`` or, when that attribute is absent,
            ``warmup_proportion``.
        model: Model whose named parameters are optimized.
        t_total: Total number of optimization steps covered by the schedule.

    Returns:
        An ``(optimizer, scheduler)`` tuple.
    """
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)

    # NOTE(review): the visible part of this script's parser defines
    # --warmup_proportion but no --warmup_steps. An explicit warmup_steps is
    # used when present; otherwise the step count is derived from the
    # proportion instead of raising AttributeError.
    warmup_steps = getattr(args, 'warmup_steps', None)
    if warmup_steps is None:
        warmup_steps = int(t_total * getattr(args, 'warmup_proportion', 0.0))

    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
    return optimizer, scheduler
global_step > 0 and args.logging_steps > 0 and global_step % args.logging_steps == 0: if args.evaluate_during_training: acc = [] - if args.task_name == "mnli": # for both MNLI-m and MNLI-mm + if args.task_name == "mnli": # for both MNLI-m and MNLI-mm acc_both = [] # collect performance of all sub-networks @@ -240,7 +240,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): else: print("***best***{}\n".format(acc)) with open(output_eval_file, "a") as writer: - writer.write("{}\n" .format(acc)) + writer.write("{}\n".format(acc)) logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = model.module if hasattr(model, 'module') else model @@ -307,11 +307,11 @@ def evaluate(args, model, tokenizer, prefix=""): preds = np.squeeze(preds) result = compute_metrics(eval_task, preds, out_label_ids) if eval_task == 'mnli-mm': - results.update({'acc_mm':result['acc']}) + results.update({'acc_mm': result['acc']}) else: results.update(result) - output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") # wirte all the results to the same file + output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") # wirte all the results to the same file with open(output_eval_file, "a") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): @@ -322,7 +322,6 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, task, tokenizer, evaluate=False): - processor = processors[task]() output_mode = output_modes[task] logger.info("Creating features from dataset file at %s", args.data_dir) @@ -338,10 +337,10 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet + pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet 
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, - ) + ) # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) @@ -405,7 +404,7 @@ def compute_neuron_head_importance(args, model, tokenizer): for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, _, label_ids = batch - segment_ids = batch[2] if args.model_type=='bert' else None # RoBERTa does't use segment_ids + segment_ids = batch[2] if args.model_type == 'bert' else None # RoBERTa does't use segment_ids # calculate head importance outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, @@ -415,7 +414,8 @@ def compute_neuron_head_importance(args, model, tokenizer): head_importance += head_mask.grad.abs().detach() # calculate neuron importance - for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight, neuron_importance): + for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight, + neuron_importance): current_importance += ((w1 * w1.grad).sum(dim=1) + b1 * b1.grad).abs().detach() current_importance += ((w2 * w2.grad).sum(dim=0)).abs().detach() @@ -515,7 +515,7 @@ def main(): args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')] # Setup CUDA, GPU & distributed training - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device @@ -535,7 +535,7 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name) - config.output_attentions, 
config.output_hidden_states, config.output_intermediate = True,True,True + config.output_attentions, config.output_hidden_states, config.output_intermediate = True, True, True tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case) # load teacher model if necessary diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py new file mode 100644 index 00000000..3eb3bd05 --- /dev/null +++ b/DynaBERT/run_multiemo.py @@ -0,0 +1,577 @@ +# coding=utf-8 +# 2020.08.28 - Changed regular fine-tuning to fine-tuning with adaptive width and depth +# Huawei Technologies Co., Ltd +# Copyright (c) 2020, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors, the HuggingFace Inc. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def soft_cross_entropy(predicts, targets):
    """Return the mean cross entropy between student logits and soft targets.

    Both arguments are raw logits; ``targets`` is turned into a probability
    distribution with a softmax over the last dimension, ``predicts`` into
    log-probabilities, and the per-row cross entropy is averaged.
    """
    log_probs = torch.log_softmax(predicts, dim=-1)
    soft_targets = torch.softmax(targets, dim=-1)
    per_row = -(soft_targets * log_probs).sum(dim=-1)
    return per_row.mean()


def set_seed(args):
    """Seed Python, NumPy and PyTorch (and all CUDA devices when ``args.n_gpu > 0``)."""
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
def train(args, train_dataset, model, tokenizer, teacher_model=None):
    """Train ``model`` over every configured depth/width sub-network.

    For each batch, gradients of all (depth_mult, width_mult) combinations are
    accumulated before a single optimizer step. Depending on
    ``args.training_phase`` the loss is a distillation loss against
    ``teacher_model`` ('dynabertw', 'dynabert') or the model's own loss
    (any other phase). After every epoch, when
    ``args.evaluate_during_training`` is set, all sub-networks are evaluated
    and the model is saved whenever the summed metric improves.

    Args:
        args: Parsed arguments; ``train_batch_size`` (and ``warmup_steps`` for
            RoBERTa) are written back onto it.
        train_dataset: Dataset yielding (input_ids, attention_mask,
            token_type_ids, labels) tensors.
        model: Student model to optimize.
        tokenizer: Tokenizer, used for evaluation and saved with the model.
        teacher_model: Optional teacher used by the distillation phases.

    Returns:
        ``(global_step, average_loss)``; the average is taken over optimizer
        steps and is ``tr_loss`` itself when no step was performed.
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.model_type == 'roberta':
        args.warmup_steps = int(t_total * 0.06)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0.0
    last_loss = None  # plain float of the most recent step loss, for reporting
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)

    current_best = 0
    output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')

    # The multiplier lists do not change during training; sort them once.
    depth_mults = sorted(args.depth_mult_list, reverse=True)
    width_mults = sorted(args.width_mult_list, reverse=True)

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3],
                      'token_type_ids': batch[2] if args.model_type in ['bert'] else None}

            # prepare the hidden states and logits of the teacher model
            if args.training_phase == 'dynabertw' and teacher_model:
                with torch.no_grad():
                    _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
            elif args.training_phase == 'dynabert' and teacher_model:
                hidden_max_all, logits_max_all = [], []
                # NOTE(review): the teacher is run once per width without any
                # width_mult being applied to it - confirm this is intended.
                for width_mult in width_mults:
                    with torch.no_grad():
                        _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
                        hidden_max_all.append(teacher_reps)
                        logits_max_all.append(teacher_logit)

            # accumulate grads for all sub-networks
            for depth_mult in depth_mults:
                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
                # select teacher model layers for matching
                # (was `== 'dynabert' or 'final_finetuning'`, which is always true)
                if args.training_phase in ('dynabert', 'final_finetuning'):
                    model = model.module if hasattr(model, 'module') else model
                    base_model = getattr(model, model.base_model_prefix, model)
                    n_layers = base_model.config.num_hidden_layers
                    depth = round(depth_mult * n_layers)
                    kept_layers_index = [math.floor(i / depth_mult) for i in range(depth)]
                    kept_layers_index.append(n_layers)

                # adjust width
                width_idx = 0
                for width_mult in width_mults:
                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
                    # stage 1: width-adaptive
                    if args.training_phase == 'dynabertw':
                        loss, student_logit, student_reps, _, _ = model(**inputs)

                        # distillation loss of logits
                        if args.output_mode == "classification":
                            logit_loss = soft_cross_entropy(student_logit, teacher_logit.detach())
                        elif args.output_mode == "regression":
                            logit_loss = 0

                        # distillation loss of hidden states
                        rep_loss = 0
                        for student_rep, teacher_rep in zip(student_reps, teacher_reps):
                            rep_loss += loss_mse(student_rep, teacher_rep.detach())

                        loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss

                    # stage 2: width- and depth- adaptive
                    elif args.training_phase == 'dynabert':
                        loss, student_logit, student_reps, _, _ = model(**inputs)

                        # distillation loss of logits
                        if args.output_mode == "classification":
                            logit_loss = soft_cross_entropy(student_logit, logits_max_all[width_idx].detach())
                        elif args.output_mode == "regression":
                            logit_loss = 0

                        # distillation loss of hidden states
                        rep_loss = 0
                        for student_rep, teacher_rep in zip(
                                student_reps, list(hidden_max_all[width_idx][i] for i in kept_layers_index)):
                            rep_loss += loss_mse(student_rep, teacher_rep.detach())

                        loss = args.depth_lambda1 * logit_loss + args.depth_lambda2 * rep_loss
                        width_idx += 1  # move to the next width

                    # stage 3: final finetuning
                    else:
                        loss = model(**inputs)[0]

                    # Debug-level instead of printing every sub-network loss to stdout.
                    logger.debug("sub-network loss (depth_mult=%s, width_mult=%s): %s",
                                 depth_mult, width_mult, loss)
                    if args.n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()

            # clip the accumulated grad from all widths
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            last_loss = loss.item()
            tr_loss += last_loss
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if 0 < t_total < global_step:
                epoch_iterator.close()
                break

        # evaluate (once per epoch)
        if args.evaluate_during_training:
            acc = []

            # collect performance of all sub-networks
            for depth_mult in depth_mults:
                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
                for width_mult in width_mults:
                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
                    results = evaluate(args, model, tokenizer)

                    logger.info("********** start evaluate results *********")
                    logger.info("depth_mult: %s ", depth_mult)
                    logger.info("width_mult: %s ", width_mult)
                    logger.info("results: %s ", results)
                    logger.info("********** end evaluate results *********")

                    acc.append(list(results.values())[0])

            result_to_save = dict()
            result_to_save['epoch'] = epoch + 1
            result_to_save['global_step'] = global_step
            # Store a plain number, not a tensor still attached to the graph.
            result_to_save['loss'] = last_loss
            result_to_save['acc'] = acc

            result_to_text_file(result_to_save, output_eval_file)

            # save model
            if sum(acc) > current_best:
                current_best = sum(acc)

                print("***best***{}\n".format(acc))
                with open(output_eval_file, "a") as writer:
                    writer.write("{}\n".format(acc))

                logger.info("Saving model checkpoint to %s", args.output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(args.output_dir)
                torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
                model_to_save.config.to_json_file(os.path.join(args.output_dir, CONFIG_NAME))
                tokenizer.save_vocabulary(args.output_dir)

        if 0 < t_total < global_step:
            train_iterator.close()
            break

    # Guard against ZeroDivisionError when no optimizer step was performed.
    return global_step, tr_loss / max(global_step, 1)
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the dev split of ``args.task_name``.

    Builds the dev dataset with ``load_and_cache_examples``, runs the model
    without gradients, passes logits and labels to ``compute_metrics`` and
    appends the resulting metrics to ``<output_dir>/eval_results.txt``.

    Args:
        args: Parsed arguments; ``eval_batch_size`` is written back onto it.
        model: Model to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer used to build the features.
        prefix: Label inserted into the log header only.

    Returns:
        Dict of metrics as returned by ``compute_metrics``.
    """
    results = {}

    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
    os.makedirs(args.output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Nothing in the loop changes the mode, so set it once.
    model.eval()

    # Collect per-batch arrays and concatenate once at the end; appending to a
    # growing array with np.append copies it on every batch.
    logit_chunks = []
    label_chunks = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None
            outputs = model(**inputs)
            logits = outputs[1]  # outputs[0] is the loss, which is not reported here

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs['labels'].detach().cpu().numpy())

    # An empty dataloader yields None, as before.
    preds = np.concatenate(logit_chunks, axis=0) if logit_chunks else None
    out_label_ids = np.concatenate(label_chunks, axis=0) if label_chunks else None

    if args.output_mode == "regression":
        preds = np.squeeze(preds)

    result = compute_metrics(args.task_name, preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write("\n")

    return results
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Build a ``TensorDataset`` for a MultiEmo task.

    Args:
        args: Parsed arguments; reads ``data_dir``, ``data_aug``,
            ``max_seq_length`` and ``model_type``.
        task: Task name made of four ``_``-separated parts; the last three are
            passed to ``MultiemoProcessor`` as ``(lang, domain, kind)``.
        tokenizer: Tokenizer used to convert examples to features.
        evaluate: Load the dev examples when true, otherwise the train
            examples (plus the augmented ones when ``args.data_aug`` is set).

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.

    Raises:
        ValueError: If ``task`` does not have four parts or the output mode is
            neither "classification" nor "regression".
    """
    task_parts = task.split('_')
    if len(task_parts) != 4:
        raise ValueError(
            "Task name must have four '_'-separated parts, got: %s" % task)
    _, lang, domain, kind = task_parts

    processor = MultiemoProcessor(lang, domain, kind)
    output_mode = multiemo_output_modes['multiemo']
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    # The MNLI/RoBERTa label swap inherited from the GLUE script was dropped:
    # 'mnli' and 'mnli-mm' can never pass the four-part split above.

    if evaluate:
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_train_examples(args.data_dir)
        if args.data_aug:
            examples = examples + processor.get_train_examples_aug(args.data_dir)

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    else:
        raise ValueError("Unsupported output mode: %s" % output_mode)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
def compute_neuron_head_importance(args, model, tokenizer):
    """Score attention heads and feed-forward neurons by loss gradients.

    Runs the dev set through the model with a differentiable all-ones head
    mask. Head importance accumulates ``|d loss / d head_mask|``; neuron
    importance accumulates ``|sum(w * grad)|`` over the intermediate weights
    and biases and over the matching output weights
    (see http://arxiv.org/abs/1905.10650).

    Returns:
        ``(head_importance, neuron_importance)``: a tensor of shape
        (num_hidden_layers, num_attention_heads) and a list with one 1-D
        tensor per collected intermediate weight.
    """
    # prepare things for heads
    model = getattr(model, 'module', model)
    backbone = getattr(model, model.base_model_prefix, model)
    n_layers = backbone.config.num_hidden_layers
    n_heads = backbone.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    head_mask = torch.ones(n_layers, n_heads).to(args.device)
    head_mask.requires_grad_(requires_grad=True)

    # collect weights
    intermediate_weight, intermediate_bias, output_weight = [], [], []
    for param_name, param in model.named_parameters():
        is_matrix = param.dim() > 1
        if 'intermediate' in param_name:
            (intermediate_weight if is_matrix else intermediate_bias).append(param)
        if 'output' in param_name and 'attention' not in param_name and is_matrix:
            output_weight.append(param)

    neuron_importance = [torch.zeros(weight.shape[0]).to(args.device)
                         for weight in intermediate_weight]

    model.to(args.device)

    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    eval_dataloader = DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset),
                                 batch_size=args.eval_batch_size)

    use_segments = args.model_type == 'bert'  # RoBERTa doesn't use segment_ids
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids, input_mask, token_types, label_ids = (t.to(args.device) for t in batch)
        segment_ids = token_types if use_segments else None

        # calculate head importance
        loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                     labels=label_ids, head_mask=head_mask)[0]
        loss.backward()
        head_importance += head_mask.grad.abs().detach()

        # calculate neuron importance
        per_layer = zip(intermediate_weight, intermediate_bias, output_weight, neuron_importance)
        for w1, b1, w2, current_importance in per_layer:
            current_importance += ((w1 * w1.grad).sum(dim=1) + b1 * b1.grad).abs().detach()
            current_importance += ((w2 * w2.grad).sum(dim=0)).abs().detach()

    return head_importance, neuron_importance
def reorder_neuron_head(model, head_importance, neuron_importance):
    """Reorder attention heads and feed-forward neurons by descending importance.

    Arguments:
        model: model (optionally wrapped, exposing ``.module``) whose base
            model provides ``encoder.layer``
        head_importance: per-layer head importance, indexed by layer
        neuron_importance: list with one importance tensor per layer
    """
    unwrapped = getattr(model, 'module', model)
    backbone = getattr(unwrapped, unwrapped.base_model_prefix, unwrapped)

    for layer_idx, ffn_scores in enumerate(neuron_importance):
        # heads first, most important first
        head_order = torch.sort(head_importance[layer_idx], descending=True)[-1]
        backbone.encoder.layer[layer_idx].attention.reorder_heads(head_order)
        # then the feed-forward neurons; both halves get the same permutation
        neuron_order = torch.sort(ffn_scores, descending=True)[-1]
        backbone.encoder.layer[layer_idx].intermediate.reorder_neurons(neuron_order)
        backbone.encoder.layer[layer_idx].output.reorder_neurons(neuron_order)
def _str_to_bool(value):
    """Parse a command-line boolean such as 'true', 'False', '1' or 'no'."""
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got: %r" % value)


def main():
    """Command-line entry point for DynaBERT training on a MultiEmo task.

    Parses the arguments, loads config, tokenizer, student and (for the
    'dynabertw' / 'dynabert' phases) teacher from ``--model_dir``, rewires the
    student by head/neuron importance in the 'dynabertw' phase and, with
    ``--do_train``, trains it and writes the run parameters and training time
    to ``<output_dir>/training_params.json``.

    Raises:
        ValueError: If the task name does not contain 'multiemo'.
    """
    # Without a configured handler the logger.info calls below are dropped.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The student (and teacher) model dir.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where trained model is saved.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the multiemo task to train")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    # These two flags had no type, so "--flag False" produced the truthy string
    # 'False'. They now parse real booleans and also accept the bare flag.
    parser.add_argument("--evaluate_during_training", default=True,
                        type=_str_to_bool, nargs='?', const=True,
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", default=True,
                        type=_str_to_bool, nargs='?', const=True,
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--hidden_dropout_prob", default=0.1, type=float,
                        help="dropout rate on hidden states.")
    parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float,
                        help="dropout rate on attention probs.")

    parser.add_argument('--data_aug', action='store_true', help="whether using data augmentation")
    # for depth direction
    parser.add_argument('--depth_mult_list', type=str, default='1.',
                        help="the possible depths used for training, e.g., '1.' is for default")
    parser.add_argument("--depth_lambda1", default=1.0, type=float,
                        help="logit matching coef.")
    parser.add_argument("--depth_lambda2", default=1.0, type=float,
                        help="hidden states matching coef.")
    # for width direction
    parser.add_argument('--width_mult_list', type=str, default='1.',
                        help="the possible widths used for training, e.g., '1.' is for separate training "
                             "while '0.25,0.5,0.75,1.0' is for vanilla slimmable training")
    parser.add_argument("--width_lambda1", default=1.0, type=float,
                        help="logit matching coef.")
    parser.add_argument("--width_lambda2", default=0.1, type=float,
                        help="hidden states matching coef.")

    parser.add_argument("--training_phase", default="dynabertw", type=str,
                        help="can be finetuning, dynabertw, dynabert, final_finetuning")

    args = parser.parse_args()

    args.width_mult_list = [float(width) for width in args.width_mult_list.split(',')]
    args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')]

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Set seed
    set_seed(args)

    # Prepare MULTIEMO task: provide num_labels here
    args.task_name = args.task_name.lower()
    if 'multiemo' not in args.task_name:
        raise ValueError("Task not found: %s" % args.task_name)
    _, lang, domain, kind = args.task_name.split('_')
    processor = MultiemoProcessor(lang, domain, kind)
    args.output_mode = multiemo_output_modes['multiemo']
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # prepare model, tokenizer and config
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name)
    config.output_attentions, config.output_hidden_states, config.output_intermediate = True, True, True
    tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case)

    # load teacher model if necessary
    if args.training_phase in ('dynabertw', 'dynabert'):
        teacher_model = model_class.from_pretrained(args.model_dir, config=config)
        teacher_model.to(args.device)
    else:
        teacher_model = None

    # load student model
    model = model_class.from_pretrained(args.model_dir, config=config)

    if args.training_phase == 'dynabertw':
        # rewire the network according to the importance of attention heads and neurons
        head_importance, neuron_importance = compute_neuron_head_importance(args, model, tokenizer)
        reorder_neuron_head(model, head_importance, neuron_importance)

    os.makedirs(args.output_dir, exist_ok=True)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        training_start_time = time.monotonic()

        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        if teacher_model:
            global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher_model)
        else:
            global_step, tr_loss = train(args, train_dataset, model, tokenizer)

        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Measure End Time
        training_end_time = time.monotonic()

        diff = timedelta(seconds=training_end_time - training_start_time)
        diff_seconds = diff.total_seconds()

        # Copy so that args itself is not mutated, and store the device as a
        # string: a torch.device object is not JSON-serializable.
        training_parameters = dict(vars(args))
        training_parameters['device'] = str(args.device)
        training_parameters['training_time'] = diff_seconds

        output_training_params_file = os.path.join(args.output_dir, "training_params.json")
        dictionary_to_json(training_parameters, output_training_params_file)


if __name__ == "__main__":
    main()
Multilingual, Multilevel, Multidomain Sentiment Analysis Corpus of Consumer Reviews.zip') + + response = requests.get(url, stream=True) + + if response.status_code == 200: + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(output_zip, 'wb') as f: + for chunk in response.iter_content(chunk_size=block_size): + if chunk: + progress_bar.update(len(chunk)) + f.write(chunk) + + progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + print("ERROR, something went wrong") + + with zipfile.ZipFile(output_zip, "r") as zip_ref: + zip_ref.extractall(data_dir) + + os.remove(output_zip) + os.remove(os.path.join(data_dir, 'multiemo.7z')) + + data_output_zip = os.path.join(data_dir, 'multiemo.zip') + with zipfile.ZipFile(data_output_zip, "r") as zip_ref: + zip_ref.extractall(data_dir) + + os.remove(data_output_zip) + os.remove(os.path.join(data_dir, 'README.txt')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='multiemo2') + args = parser.parse_args() + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + + main(data_dir=args.data_dir) diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py index 942bd7b1..00d321ad 100644 --- a/DynaBERT/transformers/data/metrics/__init__.py +++ b/DynaBERT/transformers/data/metrics/__init__.py @@ -18,6 +18,8 @@ import sys import logging +import numpy as np + logger = logging.getLogger(__name__) try: diff --git a/DynaBERT/utils.py b/DynaBERT/utils.py new file mode 100644 index 00000000..5decae1e --- /dev/null +++ b/DynaBERT/utils.py @@ -0,0 +1,33 @@ +import json +import logging +import os +import sys + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + 
format=log_format, datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger(__name__) + + +def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None: + with open(file_name, "a") as writer: + for key in sorted(result.keys()): + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("") + + +def dictionary_to_json(dictionary: dict, file_name: str): + with open(file_name, "w") as f: + json.dump(dictionary, f, indent=2) + + +def is_folder_empty(folder_name: str): + if len([f for f in os.listdir(folder_name) if not f.startswith('.')]) == 0: + return True + else: + return False + + +def get_immediate_subdirectories(directory: str): + return [os.path.join(directory, name) for name in os.listdir(directory) + if os.path.isdir(os.path.join(directory, name))] \ No newline at end of file From a0fb1f2f4bb0200a2811c36b18049efe8e14da29 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 16:20:36 +0100 Subject: [PATCH 43/62] feat: add evaluation on test dataset --- DynaBERT/eval_multiemo.py | 372 ++++++++++++++++++ .../transformers/data/processors/multiemo.py | 25 +- 2 files changed, 386 insertions(+), 11 deletions(-) create mode 100644 DynaBERT/eval_multiemo.py diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py new file mode 100644 index 00000000..ead563b9 --- /dev/null +++ b/DynaBERT/eval_multiemo.py @@ -0,0 +1,372 @@ +# coding=utf-8 +# 2020.08.28 - Changed regular evaluation to evaluation with adaptive width and depth +# Huawei Technologies Co., Ltd +# Copyright (c) 2020, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import os +import random +import time +from datetime import timedelta + +import numpy as np +import torch +from sklearn.metrics import classification_report +from torch.utils.data import (DataLoader, SequentialSampler, TensorDataset) +from tqdm import tqdm + +from transformers import (BertConfig, + BertForSequenceClassification, BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + ) + +from transformers.data.metrics import multiemo_compute_metrics as compute_metrics +from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \ + MultiemoProcessor, multiemo_output_modes +from utils import dictionary_to_json + +logger = logging.getLogger(__name__) + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.seq_length = seq_length + self.label_id = label_id + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + 
+ +def convert_examples_to_features_test(examples, label_list, max_seq_length, + tokenizer, output_mode): + """Loads a data file into a list of `InputBatch`s.""" + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + seq_length = len(input_ids) + + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + try: + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + except: + label_id = 0 + + if ex_index < 1: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: {}".format(example.label)) + logger.info("label_id: {}".format(label_id)) + + features.append( + InputFeatures(input_ids=input_ids, + 
input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + seq_length=seq_length)) + return features + + +MODEL_CLASSES = { + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), + 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), +} + + +def set_seed(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + +def evaluate(args, model, tokenizer, prefix=""): + results = {} + eval_dataset = load_and_cache_examples_test(args, args.task_name, tokenizer) + + eval_output_dir = os.path.join( + args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval') + + if not os.path.exists(eval_output_dir): + # and args.local_rank in [-1, 0]: + os.makedirs(eval_output_dir) + + args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + eval_sampler = SequentialSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + + logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None + for batch in tqdm(eval_dataloader, desc="Evaluating"): + model.eval() + batch = tuple(t.to(args.device) for t in batch) + + with torch.no_grad(): + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] \ + else None # XLM, DistilBERT and RoBERTa don't use segment_ids + outputs = model(**inputs) + + tmp_eval_loss, logits = outputs[:2] + eval_loss += tmp_eval_loss.mean().item() + + nb_eval_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + out_label_ids = 
inputs['labels'].detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + + if args.output_mode == "regression": + preds = np.squeeze(preds) + + result = compute_metrics(args.task_name, preds, out_label_ids) + results.update(result) + output_eval_file = os.path.join(eval_output_dir, "test_results_{0}.txt".format(args.task_name)) + + with open(output_eval_file, "a") as writer: + logger.info("***** Eval results {} *****".format(prefix)) + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("\n") + + return results, preds, out_label_ids + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_label_ids, all_seq_lengths) + return tensor_data, all_label_ids + + +def load_and_cache_examples_test(args, task, tokenizer): + _, lang, domain, kind = args.task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + output_mode = multiemo_output_modes['multiemo'] + label_list = processor.get_labels() + + examples = processor.get_test_examples(args.data_dir) + features = convert_examples_to_features_test(examples, label_list, args.max_seq_length, tokenizer, output_mode) + data, labels = 
get_tensor_data(output_mode, features) + return data, label_list + + +def load_and_cache_examples(args, task, tokenizer, evaluate=False): + _, lang, domain, kind = args.task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + output_mode = multiemo_output_modes['multiemo'] + + logger.info("Creating features from dataset file at %s", args.data_dir) + label_list = processor.get_labels() + + examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, + ) + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + if output_mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) + return dataset + + +def main(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_type", default=None, type=str, required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument("--task_name", default=None, type=str, required=True, + help="The name of the task to train selected") + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions will be written.") + parser.add_argument("--max_seq_length", default=128, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.") + parser.add_argument("--do_lower_case", default=True, + help="Set this flag if you are using an uncased model.") + parser.add_argument("--per_gpu_eval_batch_size", default=128, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument("--no_cuda", action='store_true', + help="Avoid using CUDA when available") + parser.add_argument('--seed', type=int, default=42, + help="random seed for initialization") + parser.add_argument("--model_dir", type=str, + help="The teacher model dir.") + parser.add_argument('--depth_mult', type=str, default='1.', + help="the possible depths used for training, e.g., '1.' is for default") + parser.add_argument('--width_mult', type=str, default='1.', + help="the possible depths used for training, e.g., '1.' 
is for default") + + args = parser.parse_args() + args.model_dir = os.path.join(args.model_dir, 'best') + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() + args.device = device + + # Setup logging + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) + logger.warning("device: %s, n_gpu: %s", device, args.n_gpu, ) + + # Set seed + set_seed(args) + + # Prepare MULTIEMO task: provide num_labels here + args.task_name = args.task_name.lower() + if 'multiemo' not in args.task_name: + raise ValueError("Task not found: %s" % args.task_name) + + _, lang, domain, kind = args.task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + args.output_mode = multiemo_output_modes['multiemo'] + label_list = processor.get_labels() + num_labels = len(label_list) + + args.model_type = args.model_type.lower() + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name) + tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case) + model = model_class.from_pretrained(args.model_dir, config=config) + model.to(args.device) + model.apply(lambda m: setattr(m, 'depth_mult', float(args.depth_mult))) + model.apply(lambda m: setattr(m, 'width_mult', float(args.width_mult))) + + eval_start_time = time.monotonic() + + results, y_logits, y_true = evaluate(args, model, tokenizer) + print(results) + + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(y_true, y_pred, target_names=label_list)) + + report = classification_report(y_true, y_pred, 
target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + + eval_output_dir = os.path.join( + args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval') + dictionary_to_json(report, os.path.join(eval_output_dir, "test_results.json")) + + +if __name__ == "__main__": + main() diff --git a/DynaBERT/transformers/data/processors/multiemo.py b/DynaBERT/transformers/data/processors/multiemo.py index be4a0eb2..e7cb3142 100644 --- a/DynaBERT/transformers/data/processors/multiemo.py +++ b/DynaBERT/transformers/data/processors/multiemo.py @@ -17,6 +17,8 @@ import logging import os +from typing import List + import numpy as np from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available @@ -27,15 +29,16 @@ logger = logging.getLogger(__name__) -def multiemo_convert_examples_to_features(examples, tokenizer, - max_length=512, - task=None, - label_list=None, - output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True): +def multiemo_convert_examples_to_features( + examples, tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): """ Loads a data file into a list of ``InputFeatures`` @@ -61,13 +64,13 @@ def multiemo_convert_examples_to_features(examples, tokenizer, """ if task is not None: - _, lang, domain, kind = task_name.split('_') + _, lang, domain, kind = task.split('_') processor = MultiemoProcessor(lang, domain, kind) if label_list is None: label_list = processor.get_labels() logger.info("Using label list %s for task %s" % (label_list, task)) if output_mode is None: - output_mode = multiemo_output_modes_output_modes[task] + output_mode = multiemo_output_modes[task] logger.info("Using output mode %s for task %s" % (output_mode, task)) label_map = {label: i for i, label in enumerate(label_list)} From 
0fc35b6354c236af984f9ae9e432466e0ee43ab9 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 17:03:48 +0100 Subject: [PATCH 44/62] feat: add script running experiments --- DynaBERT/run_experiments.py | 149 ++++++++++++++++++++++++++++++++++++ DynaBERT/run_multiemo.py | 2 - 2 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 DynaBERT/run_experiments.py diff --git a/DynaBERT/run_experiments.py b/DynaBERT/run_experiments.py new file mode 100644 index 00000000..6f821b19 --- /dev/null +++ b/DynaBERT/run_experiments.py @@ -0,0 +1,149 @@ +import logging +import os +import sys + +PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__)) +DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%d/%m/%Y %H:%M:%S') +logger = logging.getLogger(__name__) + +data_dir = os.path.join('data', 'multiemo2') + +batch_size = 16 +num_train_epochs = 3 +learning_rate = 5e-5 +weight_decay = 0.01 + + +def main(): + print(PROJECT_FOLDER) + os.chdir(PROJECT_FOLDER) + + if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')): + logger.info("Downloading Multiemo data") + cmd = 'python3 scripts/download_dataset.py --data_dir data/multiemo2' + run_process(cmd) + logger.info("Downloading finished") + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')): + logger.info("Downloading bert-base-uncased model") + cmd = 'python3 download_bert_base.py' + run_process(cmd) + logger.info("Downloading finished") + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')): + cmd = 'python3 multiemo_fine_tune_bert.py ' + options = [ + '--pretrained_model', 'data/models/bert-base-uncased', + '--data_dir', 'data/multiemo2', + '--task_name', 'multiemo_en_all_sentence', + '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence', + '--learning_rate', 
str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--weight_decay', str(weight_decay), + '--train_batch_size', str(batch_size), + '--do_lower_case' + ] + cmd += ' '.join(options) + logger.info(f"Training bert-base-uncased for multiemo_en_all_sentence") + run_process(cmd) + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabertw', 'multiemo_en_all_sentence')): + cmd = 'python3 run_multiemo.py ' + options = [ + '--model_type', 'bert', + '--task_name', 'multiemo_en_all_sentence', + '--do_train', + '--data_dir', 'data/multiemo2', + '--model_dir ', 'data/models/bert-base-uncased/multiemo_en_all_sentence', + '--output_dir', 'data/models/dynabertw/multiemo_en_all_sentence', + '--max_seq_length', str(128), + '--learning_rate', str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--per_gpu_train_batch_size', str(batch_size), + '--weight_decay', str(weight_decay), + '--width_mult_list', '0.25,0.5,0.75,1.0', + '--width_lambda1', str(1.0), + '--width_lambda2', str(0.1), + '--training_phase', 'dynabertw' + ] + cmd += ' '.join(options) + logger.info(f"Training DynaBERT_W for multiemo_en_all_sentence") + run_process(cmd) + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabert', 'multiemo_en_all_sentence')): + cmd = 'python3 run_multiemo.py ' + options = [ + '--model_type', 'bert', + '--task_name', 'multiemo_en_all_sentence', + '--do_train', + '--data_dir', 'data/multiemo2', + '--model_dir ', 'data/models/dynabertw/multiemo_en_all_sentence', + '--output_dir', 'data/models/dynabert/multiemo_en_all_sentence', + '--max_seq_length', str(128), + '--learning_rate', str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--per_gpu_train_batch_size', str(batch_size), + '--weight_decay', str(weight_decay), + '--width_mult_list', '0.25,0.5,0.75,1.0', + '--depth_mult_list', '0.5,0.75,1.0', + '--width_lambda1', str(1.0), + '--width_lambda2', str(1.0), + '--training_phase', 'dynabert', + ] + cmd += ' 
'.join(options) + logger.info(f"Training DynaBERT for multiemo_en_all_sentence") + run_process(cmd) + + if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabert-finetuned', 'multiemo_en_all_sentence')): + cmd = 'python3 run_multiemo.py ' + options = [ + '--model_type', 'bert', + '--task_name', 'multiemo_en_all_sentence', + '--do_train', + '--data_dir', 'data/multiemo2', + '--model_dir ', 'data/models/dynabertw/multiemo_en_all_sentence', + '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence', + '--max_seq_length', str(128), + '--learning_rate', str(learning_rate), + '--num_train_epochs', str(num_train_epochs), + '--per_gpu_train_batch_size', str(batch_size), + '--weight_decay', str(weight_decay), + '--width_mult_list', '0.25,0.5,0.75,1.0', + '--depth_mult_list', '0.5,0.75,1.0', + '--training_phase', 'final_finetuning ', + ] + cmd += ' '.join(options) + logger.info(f"Finetuning DynaBERT for multiemo_en_all_sentence") + run_process(cmd) + + + cmd = 'python3 eval_multiemo.py ' + options = [ + '--model_type', 'bert', + '--task_name', 'multiemo_en_all_sentence', + '--data_dir', 'data/multiemo2', + '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence' + '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence' + '--max_seq_length', str(128), + '--depth_mult', '0.5' + ] + cmd += ' '.join(options) + logger.info(f"Evaluating DynaBERT for multiemo_en_all_sentence") + run_process(cmd) + + + # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence' + # logger.info(f"Gathering results to csv for multiemo_en_all_sentence") + # run_process(cmd) + + +def run_process(proc): + os.system(proc) + + +if __name__ == '__main__': + main() diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py index 3eb3bd05..30cb4c28 100644 --- a/DynaBERT/run_multiemo.py +++ b/DynaBERT/run_multiemo.py @@ -467,8 +467,6 @@ def main(): help="Total number of training epochs to perform.") 
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, From 045ababe671b3017a4e7d21ae66a81784426c481 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 19:06:37 +0100 Subject: [PATCH 45/62] feat: fix typing --- DynaBERT/transformers/data/processors/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/DynaBERT/transformers/data/processors/utils.py b/DynaBERT/transformers/data/processors/utils.py index 1556ef02..d1178364 100644 --- a/DynaBERT/transformers/data/processors/utils.py +++ b/DynaBERT/transformers/data/processors/utils.py @@ -18,6 +18,7 @@ import sys import copy import json +from typing import List class InputExample(object): From b82f0d31b0e83f7e388648cf075a89369bf87f80 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 19:23:11 +0100 Subject: [PATCH 46/62] feat: add regex to requirements --- TernaryBERT/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt index 7dc69627..52e38093 100644 --- a/TernaryBERT/requirements.txt +++ b/TernaryBERT/requirements.txt @@ -8,4 +8,5 @@ numpy~=1.21.2 pandas~=1.3.3 scikit-learn~=1.0 tqdm +regex # torch==1.1.0 From 76c9611b3c5d409e94a20f949ba80b08cf13f326 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 19:47:23 +0100 Subject: [PATCH 47/62] feat: add minor corrects --- DynaBERT/run_multiemo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py index 30cb4c28..9d1459c0 100644 --- a/DynaBERT/run_multiemo.py +++ b/DynaBERT/run_multiemo.py @@ -189,7 +189,6 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): else: 
loss = model(**inputs)[0] - print(loss) if args.n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: @@ -388,7 +387,7 @@ def compute_neuron_head_importance(args, model, tokenizer): eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - for batch in tqdm(eval_dataloader, desc="Evaluating"): + for batch in tqdm(eval_dataloader, desc="Evaluating for determining importance"): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, _, label_ids = batch segment_ids = batch[2] if args.model_type == 'bert' else None # RoBERTa does't use segment_ids From 3f7546bdf8915c9b5c241e5b03edf74699e3ff32 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 20:44:49 +0100 Subject: [PATCH 48/62] feat: add skilearn imports --- DynaBERT/transformers/data/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py index 00d321ad..8893d540 100644 --- a/DynaBERT/transformers/data/metrics/__init__.py +++ b/DynaBERT/transformers/data/metrics/__init__.py @@ -24,7 +24,8 @@ try: from scipy.stats import pearsonr, spearmanr - from sklearn.metrics import matthews_corrcoef, f1_score + from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score + _has_sklearn = True except (AttributeError, ImportError) as e: logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") From 8f21d8e74a29166ecfd12b853fff2d8ec3cceef8 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Tue, 30 Nov 2021 22:49:11 +0100 Subject: [PATCH 49/62] feat: coorect sabing results --- DynaBERT/run_multiemo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py index 9d1459c0..bf5d7d08 100644 --- a/DynaBERT/run_multiemo.py +++ b/DynaBERT/run_multiemo.py @@ -241,8 +241,6 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None): current_best = sum(acc) print("***best***{}\n".format(acc)) - with open(output_eval_file, "a") as writer: - writer.write("{}\n".format(acc)) logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = model.module if hasattr(model, 'module') else model @@ -567,6 +565,8 @@ def main(): training_parameters['training_time'] = diff_seconds output_training_params_file = os.path.join(args.output_dir, "training_params.json") + + training_parameters.pop('device') dictionary_to_json(training_parameters, output_training_params_file) From ba9908ea8ac7d3b81fcf85063c1a4de450b56981 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 1 Dec 2021 09:08:18 +0100 Subject: [PATCH 50/62] feat: coorect arguments of evaluating --- DynaBERT/run_experiments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DynaBERT/run_experiments.py b/DynaBERT/run_experiments.py index 6f821b19..3a1b448f 100644 --- a/DynaBERT/run_experiments.py +++ b/DynaBERT/run_experiments.py @@ -126,8 +126,8 @@ def main(): '--model_type', 'bert', '--task_name', 'multiemo_en_all_sentence', '--data_dir', 'data/multiemo2', - '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence' - '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence' + '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence', + '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence', 
'--max_seq_length', str(128), '--depth_mult', '0.5' ] From d62c3038e7088e191f7b22aa530e4221cd83142a Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 1 Dec 2021 09:10:57 +0100 Subject: [PATCH 51/62] feat: correct model dir --- DynaBERT/eval_multiemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py index ead563b9..f589e318 100644 --- a/DynaBERT/eval_multiemo.py +++ b/DynaBERT/eval_multiemo.py @@ -311,7 +311,7 @@ def main(): help="the possible depths used for training, e.g., '1.' is for default") args = parser.parse_args() - args.model_dir = os.path.join(args.model_dir, 'best') + # args.model_dir = os.path.join(args.model_dir, 'best') device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device From 1938cb438b80243fba689d6d1f3d78b1ba7dac9a Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Wed, 1 Dec 2021 09:21:15 +0100 Subject: [PATCH 52/62] feat: correct loading test data --- DynaBERT/eval_multiemo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py index f589e318..b78b9bc4 100644 --- a/DynaBERT/eval_multiemo.py +++ b/DynaBERT/eval_multiemo.py @@ -158,7 +158,7 @@ def set_seed(args): def evaluate(args, model, tokenizer, prefix=""): results = {} - eval_dataset = load_and_cache_examples_test(args, args.task_name, tokenizer) + eval_dataset, _ = load_and_cache_examples_test(args, args.task_name, tokenizer) eval_output_dir = os.path.join( args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval') From c112a595246334b060444a0811d21485d2ef0e1f Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 14:28:15 +0100 Subject: [PATCH 53/62] feat: add gathering results for ternarybert --- TernaryBERT/eval_quant_multiemo.py | 230 +++++++++++++++++++++++++++++ TernaryBERT/gather_results.py | 102 
+++++++++++++ TernaryBERT/quant_task_multiemo.py | 63 +++++--- TernaryBERT/run_experiments.py | 17 +++ 4 files changed, 391 insertions(+), 21 deletions(-) create mode 100644 TernaryBERT/eval_quant_multiemo.py create mode 100644 TernaryBERT/gather_results.py diff --git a/TernaryBERT/eval_quant_multiemo.py b/TernaryBERT/eval_quant_multiemo.py new file mode 100644 index 00000000..0312ecee --- /dev/null +++ b/TernaryBERT/eval_quant_multiemo.py @@ -0,0 +1,230 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import random +import time +from datetime import timedelta + +import torch +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset +from torch.nn import CrossEntropyLoss, MSELoss +from sklearn.metrics import classification_report +from tqdm import tqdm + +from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification +from transformer import BertTokenizer +from transformer import BertConfig +from utils_multiemo import * +from utils import dictionary_to_json, result_to_text_file + +log_format = '%(asctime)s %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt='%m/%d %I:%M:%S %p') +logger = logging.getLogger() + + +def get_tensor_data(output_mode, features): + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + + all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long) + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, 
all_seq_lengths) + return tensor_data, all_label_ids + + +def do_eval(model, task_name, eval_dataloader, + device, output_mode, eval_labels, num_labels): + eval_loss = 0 + nb_eval_steps = 0 + all_logits = None + + for batch_ in tqdm(eval_dataloader): + batch_ = tuple(t.to(device) for t in batch_) + with torch.no_grad(): + input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_ + logits, _, _ = model(input_ids, segment_ids, input_mask) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + + if all_logits is None: + all_logits = logits.detach().cpu().numpy() + else: + all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + + if output_mode == "regression": + all_logits = np.squeeze(all_logits) + result = compute_metrics(task_name, all_logits, eval_labels.numpy()) + result['eval_loss'] = eval_loss + return result, all_logits + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", + default='data', + type=str, + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_dir", + default='models/tinybert', + type=str, + help="The model dir.") + parser.add_argument("--task_name", + type=str, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default='output', + type=str, + help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + + parser.add_argument("--weight_bits", + default=2, + type=int, + choices=[2, 8], + help="Quantization bits for weight.") + parser.add_argument("--input_bits", + default=8, + type=int, + help="Quantization bits for activation.") + parser.add_argument("--clip_val", + default=2.5, + type=float, + help="Initial clip value.") + + args = parser.parse_args() + task_name = args.task_name.lower() + data_dir = args.data_dir + + model_dir = os.path.join(args.model_dir, task_name) + output_dir = os.path.join(args.output_dir, task_name) + os.makedirs(output_dir, exist_ok=True) + + output_modes = { + "multiemo": "classification" + } + + default_params = { + "multiemo": {"max_seq_length": 128, "batch_size": 16} + } + + # Prepare devices + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + + # Prepare seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if task_name in default_params: + args.batch_size = default_params[task_name]["batch_size"] + if n_gpu > 0: + args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params[task_name]["max_seq_length"] + elif 'multiemo' in task_name: + args.batch_size = default_params['multiemo']["batch_size"] + if n_gpu > 0: + 
args.batch_size = int(args.batch_size * n_gpu) + args.max_seq_length = default_params['multiemo']["max_seq_length"] + + if 'multiemo' in task_name: + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + else: + raise ValueError("Task not found: %s" % task_name) + + if 'multiemo' in task_name: + output_mode = output_modes['multiemo'] + else: + raise ValueError("Task not found: %s" % task_name) + + label_list = processor.get_labels() + num_labels = len(label_list) + + tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=args.do_lower_case) + + ######################### + # Test model # + ######################### + test_examples = processor.get_test_examples(data_dir) + test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, + output_mode) + + test_data, test_labels = get_tensor_data(output_mode, test_features) + test_sampler = SequentialSampler(test_data) + test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size) + + config = BertConfig.from_pretrained( + model_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + model = QuantBertForSequenceClassification.from_pretrained(model_dir, config=config, num_labels=num_labels) + model.to(device) + + model_quant_dir = os.path.join(model_dir, 'quant') + qunat_config = BertConfig.from_pretrained( + model_quant_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + quant_model = QuantBertForSequenceClassification.from_pretrained(model_quant_dir, config=qunat_config, + num_labels=num_labels) + quant_model.to(device) + + output_quant_dir = os.path.join(output_dir, 'quant') + for m, out_dir in zip([model, quant_model], [output_dir, output_quant_dir]): + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", 
len(test_features)) + logger.info(" Batch size = %d", args.batch_size) + + eval_start_time = time.monotonic() + m.eval() + result, y_logits = do_eval(m, task_name, test_dataloader, + device, output_mode, test_labels, num_labels) + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + result['eval_time'] = diff_seconds + result_to_text_file(result, os.path.join(out_dir, "test_results.txt")) + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) + + report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + dictionary_to_json(report, os.path.join(out_dir, "test_results.json")) + + +if __name__ == "__main__": + main() diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py new file mode 100644 index 00000000..dd925b15 --- /dev/null +++ b/TernaryBERT/gather_results.py @@ -0,0 +1,102 @@ +import argparse +import json +import os +from typing import Any, Dict + +import pandas as pd + +from transformer import BertConfig +from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification +from utils_multiemo import MultiemoProcessor + +PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__)) +DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') +MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'bert-of-theseus') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + + args = parser.parse_args() + task_name = args.task_name + + models_subdirectories = get_immediate_subdirectories(MODELS_FOLDER) + print(MODELS_FOLDER) + + print(models_subdirectories) + data = list() + for subdirectory in models_subdirectories: + 
data_dict = gather_results(subdirectory, task_name) + data.append(data_dict) + + df = pd.DataFrame(data) + cols = df.columns.tolist() + cols = cols[-1:] + cols[:-1] + df = df[cols] + df.to_csv(os.path.join(DATA_FOLDER, 'results-ternarybert-' + task_name + '.csv'), index=False) + + +def get_immediate_subdirectories(a_dir): + return [os.path.join(a_dir, name) for name in os.listdir(a_dir) + if os.path.isdir(os.path.join(a_dir, name))] + + +def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]: + quant_model_dir = os.path.join(model_dir, 'quant') + + with open(os.path.join(quant_model_dir, 'training_params.json')) as json_file: + training_data_dict = json.load(json_file) + + with open(os.path.join(quant_model_dir, 'test_results.json')) as json_file: + test_data = json.load(json_file) + [test_data_dict] = pd.json_normalize(test_data, sep='_').to_dict(orient='records') + + data = training_data_dict.copy() # start with keys and values of x + data.update(test_data_dict) + + model_size = os.path.getsize(os.path.join(quant_model_dir, 'pytorch_model.bin')) + data['model_size'] = model_size + + if 'multiemo' not in task_name: + raise ValueError("Task not found: %s" % task_name) + + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + label_list = processor.get_labels() + num_labels = len(label_list) + + # LOADING THE BEST MODEL + student_config = BertConfig.from_pretrained( + quant_model_dir, + quantize_act=True, + weight_bits=data['weight_bits'], + input_bits=data['input_bits'], + clip_val=data['clip_val'] + ) + model = QuantBertForSequenceClassification.from_pretrained(quant_model_dir, config=student_config, + num_labels=num_labels) + + memory_params = sum([param.nelement() * param.element_size() for param in model.parameters()]) + memory_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()]) + memory_used = memory_params + memory_buffers # in bytes + + data['memory'] = memory_used + + 
parameters_num = 0 + for n, p in model.named_parameters(): + parameters_num += p.nelement() + + data['parameters'] = parameters_num + data['name'] = os.path.basename(model_dir) + print(data) + + return data + + +if __name__ == '__main__': + main() diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py index 4a0c43b7..1726b13f 100644 --- a/TernaryBERT/quant_task_multiemo.py +++ b/TernaryBERT/quant_task_multiemo.py @@ -435,28 +435,49 @@ def main(): test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size) - logger.info("\n***** Running evaluation on test dataset *****") - logger.info(" Num examples = %d", len(test_features)) - logger.info(" Batch size = %d", args.batch_size) - - eval_start_time = time.monotonic() - student_model.eval() - result, y_logits = do_eval(student_model, task_name, test_dataloader, - device, output_mode, test_labels, num_labels) - eval_end_time = time.monotonic() - - diff = timedelta(seconds=eval_end_time - eval_start_time) - diff_seconds = diff.total_seconds() - result['eval_time'] = diff_seconds - result_to_text_file(result, os.path.join(output_dir, "test_results.txt")) - - y_pred = np.argmax(y_logits, axis=1) - print('\n\t**** Classification report ****\n') - print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) + config = BertConfig.from_pretrained( + output_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + model = QuantBertForSequenceClassification.from_pretrained(output_dir, config=config, num_labels=num_labels) - report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) - report['eval_time'] = diff_seconds - dictionary_to_json(report, os.path.join(output_dir, "test_results.json")) + output_quant_dir = os.path.join(output_dir, 'quant') + qunat_config = BertConfig.from_pretrained( + 
output_quant_dir, + quantize_act=True, + weight_bits=args.weight_bits, + input_bits=args.input_bits, + clip_val=args.clip_val + ) + quant_model = QuantBertForSequenceClassification.from_pretrained(output_quant_dir, config=qunat_config, + num_labels=num_labels) + + for m, out_dir in zip([model, quant_model], [output_dir, output_quant_dir]): + logger.info("\n***** Running evaluation on test dataset *****") + logger.info(" Num examples = %d", len(test_features)) + logger.info(" Batch size = %d", args.batch_size) + + eval_start_time = time.monotonic() + m.eval() + result, y_logits = do_eval(m, task_name, test_dataloader, + device, output_mode, test_labels, num_labels) + eval_end_time = time.monotonic() + + diff = timedelta(seconds=eval_end_time - eval_start_time) + diff_seconds = diff.total_seconds() + result['eval_time'] = diff_seconds + result_to_text_file(result, os.path.join(out_dir, "test_results.txt")) + + y_pred = np.argmax(y_logits, axis=1) + print('\n\t**** Classification report ****\n') + print(classification_report(test_labels.numpy(), y_pred, target_names=label_list)) + + report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True) + report['eval_time'] = diff_seconds + dictionary_to_json(report, os.path.join(out_dir, "test_results.json")) def get_optimizer(args, num_train_optimization_steps, student_model): diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py index 0c0882f4..e669a3bb 100644 --- a/TernaryBERT/run_experiments.py +++ b/TernaryBERT/run_experiments.py @@ -17,6 +17,8 @@ learning_rate = 5e-5 weight_decay = 0.01 +evaluate = False + def main(): print(PROJECT_FOLDER) @@ -72,6 +74,21 @@ def main(): logger.info(f"Training ternarybert for multiemo_en_all_sentence") run_process(cmd) + if evaluate: + cmd = 'python3 eval_quant_multiemo.py ' + options = [ + '--data_dir', 'data/multiemo2', + '--model_dir ', 'data/models/ternarybert', + '--task_name', 'multiemo_en_all_sentence', + 
'--output_dir', 'data/models/ternarybert' + '--weight_bits', str(2), + '--input_bits', str(8), + '--do_lower_case' + ] + cmd += ' '.join(options) + logger.info(f"Evaluating ternarybert for multiemo_en_all_sentence") + run_process(cmd) + # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence' # logger.info(f"Gathering results to csv for multiemo_en_all_sentence") # run_process(cmd) From 7b592794805e606bb224e9a3aeebd33fa2079b3a Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 14:49:35 +0100 Subject: [PATCH 54/62] feat: correct dir in gathering result script --- TernaryBERT/gather_results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py index dd925b15..9f4d96c6 100644 --- a/TernaryBERT/gather_results.py +++ b/TernaryBERT/gather_results.py @@ -11,7 +11,7 @@ PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__)) DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') -MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'bert-of-theseus') +MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'ternarybert') def main(): @@ -49,7 +49,7 @@ def get_immediate_subdirectories(a_dir): def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]: quant_model_dir = os.path.join(model_dir, 'quant') - with open(os.path.join(quant_model_dir, 'training_params.json')) as json_file: + with open(os.path.join(model_dir, 'training_params.json')) as json_file: training_data_dict = json.load(json_file) with open(os.path.join(quant_model_dir, 'test_results.json')) as json_file: From 0c82cc6c0a441f1ff52d1a18227c508c01bde6be Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 14:50:15 +0100 Subject: [PATCH 55/62] add results --- .../data/results-ternarybert-multiemo_en_all_sentence.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv diff --git 
a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv new file mode 100644 index 00000000..0ed4a317 --- /dev/null +++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv @@ -0,0 +1,2 @@ +name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters +multiemo_en_all_sentence,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316 From 4fec476b657fb8455574bb8c6817b261d4b03d18 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 15:16:01 +0100 Subject: [PATCH 56/62] feat: add adding results for DynaBERT --- DynaBERT/gather_results.py | 
112 +++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 DynaBERT/gather_results.py diff --git a/DynaBERT/gather_results.py b/DynaBERT/gather_results.py new file mode 100644 index 00000000..3f1a7ad4 --- /dev/null +++ b/DynaBERT/gather_results.py @@ -0,0 +1,112 @@ +import argparse +import json +import os +from typing import Any, Dict + +import pandas as pd + +from transformers import BertConfig, BertForSequenceClassification + +from transformers.data.processors.multiemo import MultiemoProcessor + +PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__)) +DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data') +MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models') +DYNABERT_FT_MODELS_FOLDER = os.path.join(MODELS_FOLDER, 'dynabert-finetuned') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument('--depth_mult', type=str, default='1.', + help="the possible depths used for training, e.g., '1.' is for default") + parser.add_argument('--width_mult', type=str, default='1.', + help="the possible depths used for training, e.g., '1.' 
is for default") + + args = parser.parse_args() + task_name = args.task_name + + models_subdirectories = get_immediate_subdirectories(DYNABERT_FT_MODELS_FOLDER) + print(DYNABERT_FT_MODELS_FOLDER) + print(models_subdirectories) + + data = list() + for subdirectory in models_subdirectories: + data_dict = gather_results(subdirectory, task_name, args.depth_mult, args.width_mult) + data.append(data_dict) + + df = pd.DataFrame(data) + cols = df.columns.tolist() + cols = cols[-2:] + cols[:-2] + df = df[cols] + df.to_csv(os.path.join(DATA_FOLDER, 'results-dynabert-' + task_name + '.csv'), index=False) + + +def get_immediate_subdirectories(a_dir): + return [os.path.join(a_dir, name) for name in os.listdir(a_dir) + if os.path.isdir(os.path.join(a_dir, name))] + + +def gather_results(model_dir: str, task_name: str, depth_mult: float, width_mult: float) -> Dict[str, Any]: + task_subfolder = os.path.basename(model_dir) + eval_output_dir = os.path.join( + model_dir, 'bert_' + str(width_mult) + '_' + str(depth_mult) + '_eval') + + with open(os.path.join(model_dir, 'training_params.json')) as json_file: + training_data_dict = json.load(json_file) + + with open(os.path.join(eval_output_dir, 'test_results.json')) as json_file: + test_data = json.load(json_file) + [test_data_dict] = pd.json_normalize(test_data, sep='_').to_dict(orient='records') + + data = training_data_dict.copy() # start with keys and values of x + data.update(test_data_dict) + + with open(os.path.join(MODELS_FOLDER, 'dynabertw', task_subfolder, 'training_params.json')) as json_file: + dynabertw_training_data_dict = json.load(json_file) + data['training_time'] = data['training_time'] + dynabertw_training_data_dict['training_time'] + + with open(os.path.join(MODELS_FOLDER, 'dynabert', task_subfolder, 'training_params.json')) as json_file: + dynabert_training_data_dict = json.load(json_file) + data['training_time'] = data['training_time'] + dynabert_training_data_dict['training_time'] + + model_size = 
os.path.getsize(os.path.join(model_dir, 'pytorch_model.bin')) + data['model_size'] = model_size + + if 'multiemo' not in task_name: + raise ValueError("Task not found: %s" % task_name) + + _, lang, domain, kind = task_name.split('_') + processor = MultiemoProcessor(lang, domain, kind) + label_list = processor.get_labels() + num_labels = len(label_list) + + # LOADING THE BEST MODEL + config = BertConfig.from_pretrained(model_dir, num_labels=num_labels, finetuning_task=task_name) + model = BertForSequenceClassification.from_pretrained(model_dir, config=config) + model.apply(lambda m: setattr(m, 'depth_mult', float(depth_mult))) + model.apply(lambda m: setattr(m, 'width_mult', float(width_mult))) + + memory_params = sum([param.nelement() * param.element_size() for param in model.parameters()]) + memory_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()]) + memory_used = memory_params + memory_buffers # in bytes + + data['memory'] = memory_used + + parameters_num = 0 + for n, p in model.named_parameters(): + parameters_num += p.nelement() + + data['parameters'] = parameters_num + data['name'] = os.path.basename(model_dir) + data['model_name'] = 'dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult) + print(data) + return data + + +if __name__ == '__main__': + main() From d8290b59e63b95b4f32a475abf0d4afc742ac3b1 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 15:21:15 +0100 Subject: [PATCH 57/62] Add DynaBERT results --- DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv diff --git a/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv new file mode 100644 index 00000000..da4102b1 --- /dev/null +++ b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv @@ -0,0 +1,2 @@ 
+name,model_name,data_dir,model_dir,output_dir,model_type,task_name,max_seq_length,do_train,evaluate_during_training,do_lower_case,per_gpu_train_batch_size,per_gpu_eval_batch_size,gradient_accumulation_steps,learning_rate,weight_decay,num_train_epochs,warmup_steps,seed,hidden_dropout_prob,attention_probs_dropout_prob,data_aug,depth_mult_list,depth_lambda1,depth_lambda2,width_mult_list,width_lambda1,width_lambda2,training_phase,n_gpu,output_mode,train_batch_size,eval_batch_size,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters +multiemo_en_all_sentence,dynabert_d_0.5_w_1.,data/multiemo2,data/models/dynabertw/multiemo_en_all_sentence,data/models/dynabert-finetuned/multiemo_en_all_sentence,bert,multiemo_en_all_sentence,128,True,True,True,16,8,1,5e-05,0.01,3.0,0,42,0.1,0.1,False,"[0.5, 0.75, 1.0]",1.0,1.0,"[0.25, 0.5, 0.75, 1.0]",1.0,0.1,final_finetuning,1,classification,16,8,34809.954543,0.7724978241949522,14.607117,0.5470219435736677,0.5124816446402349,0.5291887793783169,681,0.7953004970628107,0.8290155440414507,0.8118081180811808,2123,0.8104786545924968,0.8232588699080158,0.8168187744458931,1522,0.798219584569733,0.7582804792107117,0.7777376219732562,1419,0.737755169949677,0.7307591344501033,0.7338883234696617,5745,0.770612184792384,0.7724978241949522,0.771219156436846,5745,438020911,437941264,109485316 From f8df472777d9c5de328d61828d210baecc2f9658 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 15:59:21 +0100 Subject: [PATCH 58/62] feat: add model name to results --- TernaryBERT/gather_results.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py index 9f4d96c6..68ca41bc 100644 --- a/TernaryBERT/gather_results.py +++ b/TernaryBERT/gather_results.py @@ -93,6 +93,7 @@ def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]: data['parameters'] = parameters_num data['name'] = os.path.basename(model_dir) + data['model_name'] = 'TernaryBERT' print(data) return data From a983e54bcf7be80fcb365f548a72e706733630b7 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 15:59:35 +0100 Subject: [PATCH 59/62] Update results --- .../data/results-ternarybert-multiemo_en_all_sentence.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv index 0ed4a317..cc0a090d 100644 --- a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv +++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv @@ -1,2 +1,2 @@ -name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters 
-multiemo_en_all_sentence,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316 +model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters,name 
+TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316,multiemo_en_all_sentence From 71cd4a5ac513ea6a57c5398eff1aecf031689398 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 16:03:12 +0100 Subject: [PATCH 60/62] feat: minor correct --- TernaryBERT/gather_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py index 68ca41bc..7953a52b 100644 --- a/TernaryBERT/gather_results.py +++ b/TernaryBERT/gather_results.py @@ -36,7 +36,7 @@ def main(): df = pd.DataFrame(data) cols = df.columns.tolist() - cols = cols[-1:] + cols[:-1] + cols = cols[-2:] + cols[:-2] df = df[cols] df.to_csv(os.path.join(DATA_FOLDER, 'results-ternarybert-' + task_name + '.csv'), index=False) From 8173a5c39f8d22075ca6e38fc82c02bd96556bc8 Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 16:02:09 +0100 Subject: [PATCH 61/62] Update results 2 --- .../data/results-ternarybert-multiemo_en_all_sentence.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv index cc0a090d..c0be3ad7 100644 --- a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv +++ 
b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv @@ -1,2 +1,2 @@ -model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters,name -TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316,multiemo_en_all_sentence 
+name,model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters +multiemo_en_all_sentence,TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316 From 7a977d6719226ac30fa4e3732baed037ed156c7c Mon Sep 17 00:00:00 2001 From: wojtek11530 Date: Thu, 9 Dec 2021 16:31:48 +0100 Subject: [PATCH 62/62] feat: capitilize model name --- DynaBERT/gather_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DynaBERT/gather_results.py b/DynaBERT/gather_results.py index 3f1a7ad4..85544525 100644 --- a/DynaBERT/gather_results.py +++ b/DynaBERT/gather_results.py @@ -103,7 +103,7 @@ def gather_results(model_dir: str, 
task_name: str, depth_mult: float, width_mult data['parameters'] = parameters_num data['name'] = os.path.basename(model_dir) - data['model_name'] = 'dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult) + data['model_name'] = 'Dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult) print(data) return data