From 29bfdc91dfb999139bd6e153fbb2f57d5cb9c946 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 09:38:24 +0200
Subject: [PATCH 01/62] Add script for downloading GLUE data (source:
 https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3)

---
 TinyBERT/scripts/download_glue_data.py | 154 +++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 TinyBERT/scripts/download_glue_data.py

diff --git a/TinyBERT/scripts/download_glue_data.py b/TinyBERT/scripts/download_glue_data.py
new file mode 100644
index 00000000..17c1a1f8
--- /dev/null
+++ b/TinyBERT/scripts/download_glue_data.py
@@ -0,0 +1,154 @@
+''' Script for downloading all GLUE data.
+
+Note: for legal reasons, we are unable to host MRPC.
+You can either use the version hosted by the SentEval team, which is already tokenized, 
+or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
+For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
+You should then rename and place specific files in a folder (see below for an example).
+
+mkdir MRPC
+cabextract MSRParaphraseCorpus.msi -d MRPC
+cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
+cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
+rm MRPC/_*
+rm MSRParaphraseCorpus.msi
+
+1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
+2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
+'''
+
+import argparse
+import io
+import os
+import sys
+import shutil
+import tempfile
+import urllib.request
+import zipfile
+
+URLLIB = urllib.request
+
+TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
+TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
+             "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
+             "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',  # was swapped with STS
+             "STS":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',  # was swapped with QQP
+             "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
+             "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
+             "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
+             "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
+             "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
+             'MRPC':'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv'}
+
+MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
+MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
+
+def download_and_extract(task, data_dir):
+    print("Downloading and extracting %s..." % task)
+    if task == "MNLI":
+        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
+    data_file = "%s.zip" % task
+    urllib.request.urlretrieve(TASK2PATH[task], data_file)
+    with zipfile.ZipFile(data_file) as zip_ref:
+        zip_ref.extractall(data_dir)
+    os.remove(data_file)
+    print("\tCompleted!")
+
+def format_mrpc(data_dir, path_to_data):
+    """Write MRPC train/dev/test TSVs to data_dir/MRPC from raw files in path_to_data (downloaded when empty)."""
+    print("Processing MRPC...")
+    mrpc_dir = os.path.join(data_dir, "MRPC")
+    os.makedirs(mrpc_dir, exist_ok=True)
+    if path_to_data:
+        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
+        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
+    else:
+        try:
+            mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+            mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+            URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
+            URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
+        except urllib.error.HTTPError:
+            print("Error downloading MRPC")
+            return
+    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
+    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
+
+    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
+            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
+        header = data_fh.readline()  # consume the source header; test.tsv gets its own below
+        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+        for idx, row in enumerate(data_fh):
+            label, id1, id2, s1, s2 = row.strip().split('\t')  # label is dropped for the test split
+            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+
+    try:
+        URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
+    except (KeyError, urllib.error.HTTPError):  # must be a tuple: "A or B" only ever catches A
+        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
+        return
+
+    dev_ids = set()  # (id1, id2) pairs that belong to the dev split; set gives O(1) membership tests
+    with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
+        for row in ids_fh:
+            dev_ids.add(tuple(row.strip().split('\t')))
+
+    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
+         io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
+         io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
+        header = data_fh.readline()
+        train_fh.write(header)
+        dev_fh.write(header)
+        for row in data_fh:
+            label, id1, id2, s1, s2 = row.strip().split('\t')
+            if (id1, id2) in dev_ids:
+                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+            else:
+                train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+    print("\tCompleted!")
+    
+def download_diagnostic(data_dir):
+    print("Downloading and extracting diagnostic...")
+    if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
+        os.mkdir(os.path.join(data_dir, "diagnostic"))
+    data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
+    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
+    print("\tCompleted!")
+    return
+
+def get_tasks(task_names):
+    task_names = task_names.split(',')
+    if "all" in task_names:
+        tasks = TASKS
+    else:
+        tasks = []
+        for task_name in task_names:
+            assert task_name in TASKS, "Task %s not found!" % task_name
+            tasks.append(task_name)
+    return tasks
+
+def main(arguments):
+    """Parse the CLI arguments and download/format every requested GLUE task into --data_dir."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
+    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
+                        type=str, default='all')
+    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
+                        type=str, default='')
+    args = parser.parse_args(arguments)
+
+    os.makedirs(args.data_dir, exist_ok=True)  # unlike os.mkdir, also creates missing parent directories
+    tasks = get_tasks(args.tasks)
+
+    for task in tasks:
+        if task == 'MRPC':
+            format_mrpc(args.data_dir, args.path_to_mrpc)
+        elif task == 'diagnostic':
+            download_diagnostic(args.data_dir)
+        else:
+            download_and_extract(task, args.data_dir)
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
\ No newline at end of file

From c2dceb7802376fbbcdc46a156b433fbb8e223d66 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 09:39:04 +0200
Subject: [PATCH 02/62] Add blank data folder

---
 TinyBERT/data/.gitkeep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 TinyBERT/data/.gitkeep

diff --git a/TinyBERT/data/.gitkeep b/TinyBERT/data/.gitkeep
new file mode 100644
index 00000000..e69de29b

From b05fee5d1451ac1100d739a737a06bc14172aa2a Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 11:15:08 +0200
Subject: [PATCH 03/62] Comment logging candidate words

---
 TinyBERT/data_augmentation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py
index b817b865..77dd989f 100644
--- a/TinyBERT/data_augmentation.py
+++ b/TinyBERT/data_augmentation.py
@@ -208,7 +208,7 @@ def augment(self, sent):
         for (idx, word) in enumerate(tokens):
             if _is_valid(word) and word not in StopWordsList:
                 candidate_words[idx] = self._word_augment(sent, idx, word)
-        logger.info(candidate_words)
+        # logger.info(candidate_words)
         cnt = 0
         while cnt < self.N:
             new_sent = list(tokens)

From 45a26dad2e9a8dd6097ff7272dff0a91f27ce48d Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 11:25:54 +0200
Subject: [PATCH 04/62] Log info about augmentation status more frequently

---
 TinyBERT/data_augmentation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py
index 77dd989f..c9d6224f 100644
--- a/TinyBERT/data_augmentation.py
+++ b/TinyBERT/data_augmentation.py
@@ -261,7 +261,7 @@ def read_augment_write(self):
                         line[augment_id] = augment_sent
                         writer.writerow(line)
 
-                if (i+1) % 1000 == 0:
+                if (i+1) % 50 == 0:
                     logger.info("Having been processing {} examples".format(str(i+1)))
 
 

From 780cdc30e9e7ff7ef4f3ebc3e86c33d0778ec887 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 13:43:56 +0200
Subject: [PATCH 05/62] Log info about loading GloVe embeddings

---
 TinyBERT/data_augmentation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/TinyBERT/data_augmentation.py b/TinyBERT/data_augmentation.py
index c9d6224f..ee043444 100644
--- a/TinyBERT/data_augmentation.py
+++ b/TinyBERT/data_augmentation.py
@@ -89,6 +89,7 @@ def _read_tsv(input_file, quotechar=None):
 
 
 def prepare_embedding_retrieval(glove_file, vocab_size=100000):
+    logger.info('Preparing GloVe embedding started')
     cnt = 0
     words = []
     embeddings = {}
@@ -117,6 +118,7 @@ def prepare_embedding_retrieval(glove_file, vocab_size=100000):
     # normalize each word vector
     d = (np.sum(emb_matrix ** 2, 1) ** 0.5)
     emb_norm = (emb_matrix.T / d).T
+    logger.info('Preparing GloVe embedding finished')
     return emb_norm, vocab, ids_to_tokens
 
 

From bc9394d45cc1bae1f8b2c360a4d2823cab7de20f Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 14 Sep 2021 13:44:25 +0200
Subject: [PATCH 06/62] Add script for standard fine-tuning of pretrained
 models

---
 TinyBERT/data_processing.py | 573 ++++++++++++++++++++++++++++++++++++
 TinyBERT/fine_tune_bert.py  | 465 +++++++++++++++++++++++++++++
 TinyBERT/task_distill.py    | 569 +----------------------------------
 3 files changed, 1041 insertions(+), 566 deletions(-)
 create mode 100644 TinyBERT/data_processing.py
 create mode 100644 TinyBERT/fine_tune_bert.py

diff --git a/TinyBERT/data_processing.py b/TinyBERT/data_processing.py
new file mode 100644
index 00000000..497dc65c
--- /dev/null
+++ b/TinyBERT/data_processing.py
@@ -0,0 +1,573 @@
+import csv
+import logging
+import os
+import sys
+
+import torch
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import f1_score, matthews_corrcoef
+from torch.utils.data import TensorDataset
+logger = logging.getLogger()  # root logger; avoids importing the task_distill script (side effects, import cycle)
+
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs a InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+            label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.seq_length = seq_length
+        self.label_id = label_id
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
+                lines.append(line)
+            return lines
+
+
+class MrpcProcessor(DataProcessor):
+    """Processor for the MRPC data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            text_b = line[4]
+            label = line[0]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliProcessor(DataProcessor):
+    """Processor for the MultiNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+            "dev_matched")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
+
+    def get_dev_examples(self, data_dir):
+        """Reads dev_mismatched.tsv from data_dir; guids are prefixed "dev_mismatched"."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
+            "dev_mismatched")
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class Sst2Processor(DataProcessor):
+    """Processor for the SST-2 data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[0]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class StsbProcessor(DataProcessor):
+    """Processor for the STS-B data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return [None]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[7]
+            text_b = line[8]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QqpProcessor(DataProcessor):
+    """Processor for the QQP data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue  # the first row is always skipped (treated as the header)
+            guid = "%s-%s" % (set_type, line[0])
+            try:
+                text_a = line[3]
+                text_b = line[4]
+                label = line[5]
+            except IndexError:  # row has fewer than six columns: silently dropped
+                continue
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QnliProcessor(DataProcessor):
+    """Processor for the QNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
+            "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue  # the first row is always skipped (treated as the header)
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_aug_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
+
+    def get_labels(self):
+        """See base class."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class WnliProcessor(DataProcessor):
+    """Processor for the WNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer, output_mode):
+    """Loads a data file into a list of `InputBatch`s."""
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        tokens_a = tokenizer.tokenize(example.text_a)
+
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
+
+        if tokens_b:
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_mask = [1] * len(input_ids)
+        seq_length = len(input_ids)
+
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+
+        if output_mode == "classification":
+            label_id = label_map[example.label]
+        elif output_mode == "regression":
+            label_id = float(example.label)
+        else:
+            raise KeyError(output_mode)
+
+        if ex_index < 1:
+            logger.info("*** Example ***")
+            logger.info("guid: %s" % (example.guid))
+            logger.info("tokens: %s" % " ".join(
+                [str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
+                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("label: {}".format(example.label))
+            logger.info("label_id: {}".format(label_id))
+
+        features.append(
+            InputFeatures(input_ids=input_ids,
+                          input_mask=input_mask,
+                          segment_ids=segment_ids,
+                          label_id=label_id,
+                          seq_length=seq_length))
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+
+
+def acc_and_f1(preds, labels):
+    acc = simple_accuracy(preds, labels)
+    f1 = f1_score(y_true=labels, y_pred=preds)
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+
+
+def pearson_and_spearman(preds, labels):
+    pearson_corr = pearsonr(preds, labels)[0]
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,
+    }
+
+
+def compute_metrics(task_name, preds, labels):
+    assert len(preds) == len(labels)
+    if task_name == "cola":
+        return {"mcc": matthews_corrcoef(labels, preds)}
+    elif task_name == "sst-2":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mrpc":
+        return acc_and_f1(preds, labels)
+    elif task_name == "sts-b":
+        return pearson_and_spearman(preds, labels)
+    elif task_name == "qqp":
+        return acc_and_f1(preds, labels)
+    elif task_name == "mnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mnli-mm":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "qnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "rte":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "wnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    else:
+        raise KeyError(task_name)
+
+
+def get_tensor_data(output_mode, features):
+    """Pack features into a TensorDataset; returns (dataset, label tensor). Raises KeyError on unknown output_mode."""
+    # Classification labels are stored as long, regression targets as float.
+    label_dtype = {"classification": torch.long, "regression": torch.float}[output_mode]
+    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
+
+    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_label_ids, all_seq_lengths)
+    return tensor_data, all_label_ids
+
+
+processors = {
+    "cola": ColaProcessor,
+    "mnli": MnliProcessor,
+    "mnli-mm": MnliMismatchedProcessor,
+    "mrpc": MrpcProcessor,
+    "sst-2": Sst2Processor,
+    "sts-b": StsbProcessor,
+    "qqp": QqpProcessor,
+    "qnli": QnliProcessor,
+    "rte": RteProcessor,
+    "wnli": WnliProcessor
+}
+
+output_modes = {
+    "cola": "classification",
+    "mnli": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification"
+}
diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
new file mode 100644
index 00000000..b129c982
--- /dev/null
+++ b/TinyBERT/fine_tune_bert.py
@@ -0,0 +1,465 @@
+# coding=utf-8
+# 2019.12.2-Changed for TinyBERT task-specific distillation
+#      Huawei Technologies Co., Ltd. <yinyichun@huawei.com>
+# Copyright 2020 Huawei Technologies Co., Ltd.
+# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler)
+from tqdm import tqdm, trange
+
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from data_processing import convert_examples_to_features, \
+    compute_metrics, get_tensor_data, processors, output_modes
+from transformer.modeling import TinyBertForSequenceClassification
+from transformer.tokenization import BertTokenizer
+from transformer.optimization import BertAdam
+from transformer.file_utils import WEIGHTS_NAME, CONFIG_NAME
+
+csv.field_size_limit(sys.maxsize)
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler('debug_layer_loss.log')  # also log to a file in the working directory
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+logger = logging.getLogger()
+
+oncloud = True
+try:
+    import moxing as mox
+except ImportError:  # moxing is optional; without it main() skips the mox upload step
+    oncloud = False
+
+
+def result_to_file(result, file_name):  # log `result` and append it to `file_name`, one "key = value" per line
+    with open(file_name, "a") as out_file:
+        logger.info("***** Eval results *****")
+        for name, value in sorted(result.items()):
+            logger.info("  %s = %s", name, str(value))
+            out_file.write("%s = %s\n" % (name, str(value)))
+
+
+def do_eval(model, task_name, eval_dataloader,
+            device, output_mode, eval_labels, num_labels):
+    """Score ``model`` on ``eval_dataloader``; return task metrics plus ``'eval_loss'``.
+
+    The model is used as-is (no ``model.eval()`` call is made here).  Every batch must
+    unpack into ``(input_ids, input_mask, segment_ids, label_ids, seq_lengths)``.  The
+    result is the dict from ``compute_metrics`` with the mean per-batch loss added.
+
+    Raises:
+        KeyError: ``output_mode`` is neither "classification" nor "regression".
+        ValueError: ``eval_dataloader`` yields no batches.
+    """
+    classification = output_mode == "classification"
+    if not classification and output_mode != "regression":
+        raise KeyError(output_mode)
+    loss_fct = CrossEntropyLoss() if classification else MSELoss()
+    eval_loss = 0
+    batch_logits = []
+
+    for batch_ in tqdm(eval_dataloader, desc="Evaluating"):
+        input_ids, input_mask, segment_ids, label_ids, _ = (t.to(device) for t in batch_)
+        with torch.no_grad():
+            logits, _, _ = model(input_ids, segment_ids, input_mask)
+
+        if classification:
+            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+        else:
+            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+        eval_loss += tmp_eval_loss.mean().item()
+        # Collect per-batch arrays and join once; np.append per batch re-copies all rows.
+        batch_logits.append(logits.detach().cpu().numpy())
+
+    if not batch_logits:
+        raise ValueError("eval_dataloader yielded no batches")
+    preds = np.concatenate(batch_logits, axis=0)
+    preds = np.argmax(preds, axis=1) if classification else np.squeeze(preds)
+    result = compute_metrics(task_name, preds, eval_labels.numpy())
+    result['eval_loss'] = eval_loss / len(batch_logits)
+
+    return result
+
+
+def main():
+    """Fine-tune or evaluate a TinyBertForSequenceClassification model on one GLUE task.
+
+    Without ``--do_eval`` the model is trained on the task's train (or augmented) set.
+    Whenever ``(global_step + 1)`` is a multiple of ``--eval_step`` it is scored on the
+    dev set, and the weights, config and vocabulary are written to ``--output_dir`` each
+    time the task metric improves.  With ``--do_eval`` only one dev evaluation is run.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--pretrained_model",
+                        default=None,
+                        type=str,
+                        help="The pretrained model dir.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--weight_decay', '--wd',
+                        default=1e-4,
+                        type=float,
+                        metavar='W',
+                        help='weight decay')
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+
+    # added arguments
+    parser.add_argument('--aug_train',
+                        action='store_true')
+    parser.add_argument('--eval_step',
+                        type=int,
+                        default=50)
+    parser.add_argument('--data_url',
+                        type=str,
+                        default="")
+
+    args = parser.parse_args()
+    logger.info('The args: {}'.format(args))
+
+    # intermediate distillation default parameters
+    default_params = {
+        "cola": {"num_train_epochs": 50, "max_seq_length": 64},
+        "mnli": {"num_train_epochs": 5, "max_seq_length": 128},
+        "mrpc": {"num_train_epochs": 20, "max_seq_length": 128},
+        "sst-2": {"num_train_epochs": 10, "max_seq_length": 64},
+        "sts-b": {"num_train_epochs": 20, "max_seq_length": 128},
+        "qqp": {"num_train_epochs": 5, "max_seq_length": 128},
+        "qnli": {"num_train_epochs": 10, "max_seq_length": 128},
+        "rte": {"num_train_epochs": 20, "max_seq_length": 128}
+    }
+
+    acc_tasks = ["mnli", "mnli-mm", "mrpc", "sst-2", "qqp", "qnli", "rte", "wnli"]
+    corr_tasks = ["sts-b"]
+    mcc_tasks = ["cola"]
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S',
+                        level=logging.INFO)
+
+    logger.info("device: {} n_gpu: {}".format(device, n_gpu))
+
+    # Prepare seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    # Prepare task settings
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    task_name = args.task_name.lower()
+
+    # NOTE(review): the two assignments below write args.max_seq_len / args.num_train_epoch,
+    # which nothing reads, so --max_seq_length / --num_train_epochs stay in effect.
+    if task_name in default_params:
+        args.max_seq_len = default_params[task_name]["max_seq_length"]
+
+    if not args.do_eval:
+        if task_name in default_params:
+            args.num_train_epoch = default_params[task_name]["num_train_epochs"]
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % task_name)
+
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
+
+    if not args.do_eval:
+        if not args.aug_train:
+            train_examples = processor.get_train_examples(args.data_dir)
+        else:
+            train_examples = processor.get_aug_examples(args.data_dir)
+        if args.gradient_accumulation_steps < 1:
+            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                args.gradient_accumulation_steps))
+
+        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+        num_train_optimization_steps = int(
+            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+
+        train_features = convert_examples_to_features(train_examples, label_list,
+                                                      args.max_seq_length, tokenizer, output_mode)
+        train_data, _ = get_tensor_data(output_mode, train_features)
+        train_sampler = RandomSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_examples = processor.get_dev_examples(args.data_dir)
+    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    model = TinyBertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels)
+    model.to(device)
+
+    if args.do_eval:
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+
+        model.eval()
+        result = do_eval(model, task_name, eval_dataloader,
+                         device, output_mode, eval_labels, num_labels)
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+    else:
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+        if n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+        size = 0
+        for n, p in model.named_parameters():
+            logger.info('n: {}'.format(n))
+            size += p.nelement()
+
+        logger.info('Total parameters: {}'.format(size))
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                             schedule='warmup_linear',
+                             lr=args.learning_rate,
+                             warmup=args.warmup_proportion,
+                             t_total=num_train_optimization_steps)
+
+        # Train and evaluate
+        global_step = 0
+        best_dev_acc = 0.0
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+
+        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0.
+            tr_cls_loss = 0.
+
+            model.train()
+            nb_tr_examples, nb_tr_steps = 0, 0
+
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)):
+                batch = tuple(t.to(device) for t in batch)
+
+                input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
+                if input_ids.size()[0] != args.train_batch_size:
+                    continue
+
+                cls_loss = 0.
+
+                logits, _, _ = model(input_ids, segment_ids, input_mask)
+
+                if output_mode == "classification":
+                    loss_fct = CrossEntropyLoss()
+                    cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+                elif output_mode == "regression":
+                    loss_mse = MSELoss()
+                    cls_loss = loss_mse(logits.view(-1), label_ids.view(-1))
+
+                loss = cls_loss
+                tr_cls_loss += cls_loss.item()
+
+                if n_gpu > 1:
+                    loss = loss.mean()  # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                loss.backward()
+
+                tr_loss += loss.item()
+                nb_tr_examples += label_ids.size(0)
+                nb_tr_steps += 1
+
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+
+                if (global_step + 1) % args.eval_step == 0:
+                    logger.info("***** Running evaluation *****")
+                    logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
+                    logger.info("  Num examples = %d", len(eval_examples))
+                    logger.info("  Batch size = %d", args.eval_batch_size)
+
+                    model.eval()
+
+                    loss = tr_loss / (step + 1)
+                    cls_loss = tr_cls_loss / (step + 1)
+
+                    # Fine-tuning has no intermediate-distillation stage, so every checkpoint
+                    # is scored on the dev set and only an improved model is saved.
+                    result = do_eval(model, task_name, eval_dataloader,
+                                     device, output_mode, eval_labels, num_labels)
+                    result['global_step'] = global_step
+                    result['cls_loss'] = cls_loss
+                    result['loss'] = loss
+
+                    result_to_file(result, output_eval_file)
+
+                    save_model = False
+
+                    if task_name in acc_tasks and result['acc'] > best_dev_acc:
+                        best_dev_acc = result['acc']
+                        save_model = True
+
+                    if task_name in corr_tasks and result['corr'] > best_dev_acc:
+                        best_dev_acc = result['corr']
+                        save_model = True
+
+                    if task_name in mcc_tasks and result['mcc'] > best_dev_acc:
+                        best_dev_acc = result['mcc']
+                        save_model = True
+
+                    if save_model:
+                        logger.info("***** Save model *****")
+
+                        model_to_save = model.module if hasattr(model, 'module') else model
+
+                        model_name = WEIGHTS_NAME
+                        output_model_file = os.path.join(args.output_dir, model_name)
+                        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+                        torch.save(model_to_save.state_dict(), output_model_file)
+                        model_to_save.config.to_json_file(output_config_file)
+                        tokenizer.save_vocabulary(args.output_dir)
+
+                        # Test mnli-mm
+                        if task_name == "mnli":
+                            # Use separate mm_* names: rebinding eval_* / task_name here would make
+                            # later dev evaluations silently run on the mismatched set.
+                            mm_processor = processors["mnli-mm"]()
+                            if not os.path.exists(args.output_dir + '-MM'):
+                                os.makedirs(args.output_dir + '-MM')
+
+                            mm_examples = mm_processor.get_dev_examples(args.data_dir)
+
+                            mm_features = convert_examples_to_features(
+                                mm_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+                            mm_data, mm_labels = get_tensor_data(output_mode, mm_features)
+
+                            logger.info("***** Running mm evaluation *****")
+                            logger.info("  Num examples = %d", len(mm_examples))
+                            logger.info("  Batch size = %d", args.eval_batch_size)
+
+                            mm_sampler = SequentialSampler(mm_data)
+                            mm_dataloader = DataLoader(mm_data, sampler=mm_sampler,
+                                                       batch_size=args.eval_batch_size)
+
+                            result = do_eval(model, "mnli-mm", mm_dataloader,
+                                             device, output_mode, mm_labels, num_labels)
+
+                            result['global_step'] = global_step
+
+                            tmp_output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
+                            result_to_file(result, tmp_output_eval_file)
+
+                        if oncloud:
+                            logging.info(mox.file.list_directory(args.output_dir, recursive=True))
+                            logging.info(mox.file.list_directory('.', recursive=True))
+                            mox.file.copy_parallel(args.output_dir, args.data_url)
+                            mox.file.copy_parallel('.', args.data_url)
+
+                    model.train()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py
index 16905a31..39628a38 100644
--- a/TinyBERT/task_distill.py
+++ b/TinyBERT/task_distill.py
@@ -29,14 +29,13 @@
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler)
 from tqdm import tqdm, trange
 
 from torch.nn import CrossEntropyLoss, MSELoss
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import matthews_corrcoef, f1_score
 
+from data_processing import convert_examples_to_features, \
+    compute_metrics, get_tensor_data, processors, output_modes
 from transformer.modeling import TinyBertForSequenceClassification
 from transformer.tokenization import BertTokenizer
 from transformer.optimization import BertAdam
@@ -59,543 +58,6 @@
     oncloud = False
 
 
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second sequence.
-            Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.seq_length = seq_length
-        self.label_id = label_id
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
-                lines.append(line)
-            return lines
-
-
-class MrpcProcessor(DataProcessor):
-    """Processor for the MRPC data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            text_b = line[4]
-            label = line[0]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliProcessor(DataProcessor):
-    """Processor for the MultiNLI data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
-            "dev_matched")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[8]
-            text_b = line[9]
-            label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliMismatchedProcessor(MnliProcessor):
-    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
-            "dev_matched")
-
-
-class ColaProcessor(DataProcessor):
-    """Processor for the CoLA data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            label = line[1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class Sst2Processor(DataProcessor):
-    """Processor for the SST-2 data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[0]
-            label = line[1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class StsbProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return [None]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[7]
-            text_b = line[8]
-            label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QqpProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            try:
-                text_a = line[3]
-                text_b = line[4]
-                label = line[5]
-            except IndexError:
-                continue
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QnliProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
-            "dev_matched")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class RteProcessor(DataProcessor):
-    """Processor for the RTE data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_aug_examples(self, data_dir):
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class WnliProcessor(DataProcessor):
-    """Processor for the WNLI data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer, output_mode):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
-
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        segment_ids = [0] * len(tokens)
-
-        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            segment_ids += [1] * (len(tokens_b) + 1)
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-        input_mask = [1] * len(input_ids)
-        seq_length = len(input_ids)
-
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        input_mask += padding
-        segment_ids += padding
-
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-
-        if output_mode == "classification":
-            label_id = label_map[example.label]
-        elif output_mode == "regression":
-            label_id = float(example.label)
-        else:
-            raise KeyError(output_mode)
-
-        if ex_index < 1:
-            logger.info("*** Example ***")
-            logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join(
-                [str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-            logger.info("label: {}".format(example.label))
-            logger.info("label_id: {}".format(label_id))
-
-        features.append(
-            InputFeatures(input_ids=input_ids,
-                          input_mask=input_mask,
-                          segment_ids=segment_ids,
-                          label_id=label_id,
-                          seq_length=seq_length))
-    return features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-
-
-def acc_and_f1(preds, labels):
-    acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
-    return {
-        "acc": acc,
-        "f1": f1,
-        "acc_and_f1": (acc + f1) / 2,
-    }
-
-
-def pearson_and_spearman(preds, labels):
-    pearson_corr = pearsonr(preds, labels)[0]
-    spearman_corr = spearmanr(preds, labels)[0]
-    return {
-        "pearson": pearson_corr,
-        "spearmanr": spearman_corr,
-        "corr": (pearson_corr + spearman_corr) / 2,
-    }
-
-
-def compute_metrics(task_name, preds, labels):
-    assert len(preds) == len(labels)
-    if task_name == "cola":
-        return {"mcc": matthews_corrcoef(labels, preds)}
-    elif task_name == "sst-2":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mrpc":
-        return acc_and_f1(preds, labels)
-    elif task_name == "sts-b":
-        return pearson_and_spearman(preds, labels)
-    elif task_name == "qqp":
-        return acc_and_f1(preds, labels)
-    elif task_name == "mnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mnli-mm":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "qnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "rte":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "wnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    else:
-        raise KeyError(task_name)
-
-
-def get_tensor_data(output_mode, features):
-    if output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
-    elif output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
-
-    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label_ids, all_seq_lengths)
-    return tensor_data, all_label_ids
-
-
 def result_to_file(result, file_name):
     with open(file_name, "a") as writer:
         logger.info("***** Eval results *****")
@@ -744,31 +206,6 @@ def main():
     args = parser.parse_args()
     logger.info('The args: {}'.format(args))
 
-    processors = {
-        "cola": ColaProcessor,
-        "mnli": MnliProcessor,
-        "mnli-mm": MnliMismatchedProcessor,
-        "mrpc": MrpcProcessor,
-        "sst-2": Sst2Processor,
-        "sts-b": StsbProcessor,
-        "qqp": QqpProcessor,
-        "qnli": QnliProcessor,
-        "rte": RteProcessor,
-        "wnli": WnliProcessor
-    }
-
-    output_modes = {
-        "cola": "classification",
-        "mnli": "classification",
-        "mrpc": "classification",
-        "sst-2": "classification",
-        "sts-b": "regression",
-        "qqp": "classification",
-        "qnli": "classification",
-        "rte": "classification",
-        "wnli": "classification"
-    }
-
     # intermediate distillation default parameters
     default_params = {
         "cola": {"num_train_epochs": 50, "max_seq_length": 64},

From 906c120f51ec0d1a47617db6cc607029148bfd42 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 09:16:59 +0200
Subject: [PATCH 07/62] Correct fine-tuning

---
 TinyBERT/fine_tune_bert.py | 57 ++++++++++++++++++--------------------
 TinyBERT/task_distill.py   |  4 +++
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index b129c982..4021bca9 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -194,14 +194,14 @@ def main():
 
     # intermediate distillation default parameters
     default_params = {
-        "cola": {"num_train_epochs": 50, "max_seq_length": 64},
-        "mnli": {"num_train_epochs": 5, "max_seq_length": 128},
-        "mrpc": {"num_train_epochs": 20, "max_seq_length": 128},
-        "sst-2": {"num_train_epochs": 10, "max_seq_length": 64},
-        "sts-b": {"num_train_epochs": 20, "max_seq_length": 128},
-        "qqp": {"num_train_epochs": 5, "max_seq_length": 128},
-        "qnli": {"num_train_epochs": 10, "max_seq_length": 128},
-        "rte": {"num_train_epochs": 20, "max_seq_length": 128}
+        "cola": {"num_train_epochs": 3, "max_seq_length": 64},
+        "mnli": {"num_train_epochs": 3, "max_seq_length": 128},
+        "mrpc": {"num_train_epochs": 3, "max_seq_length": 128},
+        "sst-2": {"num_train_epochs": 3, "max_seq_length": 64},
+        "sts-b": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qqp": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qnli": {"num_train_epochs": 3, "max_seq_length": 128},
+        "rte": {"num_train_epochs": 3, "max_seq_length": 128}
     }
 
     acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
@@ -311,8 +311,7 @@ def main():
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
         schedule = 'warmup_linear'
-        if not args.pred_distill:
-            schedule = 'none'
+
         optimizer = BertAdam(optimizer_grouped_parameters,
                              schedule=schedule,
                              lr=args.learning_rate,
@@ -379,32 +378,31 @@ def main():
                     loss = tr_loss / (step + 1)
                     cls_loss = tr_cls_loss / (step + 1)
 
-                    result = {}
-                    if args.pred_distill:
-                        result = do_eval(model, task_name, eval_dataloader,
-                                         device, output_mode, eval_labels, num_labels)
+                    result = do_eval(model, task_name, eval_dataloader,
+                                     device, output_mode, eval_labels, num_labels)
                     result['global_step'] = global_step
                     result['cls_loss'] = cls_loss
                     result['loss'] = loss
 
+                    logger.info("***** Eval results *****")
+                    for key in sorted(result.keys()):
+                        logger.info("  %s = %s", key, str(result[key]))
+
                     result_to_file(result, output_eval_file)
 
-                    if not args.pred_distill:
-                        save_model = True
-                    else:
-                        save_model = False
+                    save_model = False
 
-                        if task_name in acc_tasks and result['acc'] > best_dev_acc:
-                            best_dev_acc = result['acc']
-                            save_model = True
+                    if task_name in acc_tasks and result['acc'] > best_dev_acc:
+                        best_dev_acc = result['acc']
+                        save_model = True
 
-                        if task_name in corr_tasks and result['corr'] > best_dev_acc:
-                            best_dev_acc = result['corr']
-                            save_model = True
+                    if task_name in corr_tasks and result['corr'] > best_dev_acc:
+                        best_dev_acc = result['corr']
+                        save_model = True
 
-                        if task_name in mcc_tasks and result['mcc'] > best_dev_acc:
-                            best_dev_acc = result['mcc']
-                            save_model = True
+                    if task_name in mcc_tasks and result['mcc'] > best_dev_acc:
+                        best_dev_acc = result['mcc']
+                        save_model = True
 
                     if save_model:
                         logger.info("***** Save model *****")
@@ -412,8 +410,7 @@ def main():
                         model_to_save = model.module if hasattr(model, 'module') else model
 
                         model_name = WEIGHTS_NAME
-                        # if not args.pred_distill:
-                        #     model_name = "step_{}_{}".format(global_step, WEIGHTS_NAME)
+
                         output_model_file = os.path.join(args.output_dir, model_name)
                         output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
 
@@ -422,7 +419,7 @@ def main():
                         tokenizer.save_vocabulary(args.output_dir)
 
                         # Test mnli-mm
-                        if args.pred_distill and task_name == "mnli":
+                        if task_name == "mnli":
                             task_name = "mnli-mm"
                             processor = processors[task_name]()
                             if not os.path.exists(args.output_dir + '-MM'):
diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py
index 39628a38..b130d80a 100644
--- a/TinyBERT/task_distill.py
+++ b/TinyBERT/task_distill.py
@@ -451,6 +451,10 @@ def soft_cross_entropy(predicts, targets):
                     result['rep_loss'] = rep_loss
                     result['loss'] = loss
 
+                    logger.info("***** Eval results *****")
+                    for key in sorted(result.keys()):
+                        logger.info("  %s = %s", key, str(result[key]))
+
                     result_to_file(result, output_eval_file)
 
                     if not args.pred_distill:

From 3dd28f91a8a0af39cd62db87dcdcb03c02200b16 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 09:41:08 +0200
Subject: [PATCH 08/62] Correct logging

---
 TinyBERT/data_processing.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/TinyBERT/data_processing.py b/TinyBERT/data_processing.py
index 497dc65c..5ab7c242 100644
--- a/TinyBERT/data_processing.py
+++ b/TinyBERT/data_processing.py
@@ -1,4 +1,5 @@
 import csv
+import logging
 import os
 import sys
 
@@ -7,7 +8,10 @@
 from sklearn.metrics import f1_score, matthews_corrcoef
 from torch.utils.data import TensorDataset
 
-from task_distill import logger
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 
 class InputExample(object):

From 40350397b2e5ace351077c70619812967894d5b1 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 09:42:23 +0200
Subject: [PATCH 09/62] Add minor correct

---
 TinyBERT/fine_tune_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index 4021bca9..837a60bf 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -248,7 +248,7 @@ def main():
     label_list = processor.get_labels()
     num_labels = len(label_list)
 
-    tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case)
+    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
 
     if not args.do_eval:
         if not args.aug_train:

From c605dbf53d3d82dbdcafa281a967ae7ee073f307 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 09:58:27 +0200
Subject: [PATCH 10/62] Add minor corrects

---
 TinyBERT/fine_tune_bert.py | 6 +-----
 TinyBERT/task_distill.py   | 4 ----
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index 837a60bf..10adafa6 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -367,7 +367,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0:
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
                     logger.info("  Num examples = %d", len(eval_examples))
@@ -384,10 +384,6 @@ def main():
                     result['cls_loss'] = cls_loss
                     result['loss'] = loss
 
-                    logger.info("***** Eval results *****")
-                    for key in sorted(result.keys()):
-                        logger.info("  %s = %s", key, str(result[key]))
-
                     result_to_file(result, output_eval_file)
 
                     save_model = False
diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py
index b130d80a..39628a38 100644
--- a/TinyBERT/task_distill.py
+++ b/TinyBERT/task_distill.py
@@ -451,10 +451,6 @@ def soft_cross_entropy(predicts, targets):
                     result['rep_loss'] = rep_loss
                     result['loss'] = loss
 
-                    logger.info("***** Eval results *****")
-                    for key in sorted(result.keys()):
-                        logger.info("  %s = %s", key, str(result[key]))
-
                     result_to_file(result, output_eval_file)
 
                     if not args.pred_distill:

From efd7a7d01104154e2385107fecb801129f40dfd4 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 10:00:33 +0200
Subject: [PATCH 11/62] Add minor corrects

---
 TinyBERT/fine_tune_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index 10adafa6..b511e429 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -367,7 +367,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == num_train_optimization_steps:
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) in [1, num_train_optimization_steps]:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
                     logger.info("  Num examples = %d", len(eval_examples))

From 065a08210986b3f518cd57cb789a7d481561c37f Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 10:08:34 +0200
Subject: [PATCH 12/62] Add minor corrects

---
 TinyBERT/fine_tune_bert.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index b511e429..9dd52d92 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -194,14 +194,14 @@ def main():
 
     # intermediate distillation default parameters
     default_params = {
-        "cola": {"num_train_epochs": 3, "max_seq_length": 64},
-        "mnli": {"num_train_epochs": 3, "max_seq_length": 128},
-        "mrpc": {"num_train_epochs": 3, "max_seq_length": 128},
-        "sst-2": {"num_train_epochs": 3, "max_seq_length": 64},
-        "sts-b": {"num_train_epochs": 3, "max_seq_length": 128},
-        "qqp": {"num_train_epochs": 3, "max_seq_length": 128},
-        "qnli": {"num_train_epochs": 3, "max_seq_length": 128},
-        "rte": {"num_train_epochs": 3, "max_seq_length": 128}
+        "cola": {"num_train_epochs": 5, "max_seq_length": 64},
+        "mnli": {"num_train_epochs": 5, "max_seq_length": 128},
+        "mrpc": {"num_train_epochs": 5, "max_seq_length": 128},
+        "sst-2": {"num_train_epochs": 5, "max_seq_length": 64},
+        "sts-b": {"num_train_epochs": 5, "max_seq_length": 128},
+        "qqp": {"num_train_epochs": 5, "max_seq_length": 128},
+        "qnli": {"num_train_epochs": 5, "max_seq_length": 128},
+        "rte": {"num_train_epochs": 5, "max_seq_length": 128}
     }
 
     acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
@@ -367,7 +367,8 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) in [1, num_train_optimization_steps]:
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 1 or \
+                        (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
                     logger.info("  Num examples = %d", len(eval_examples))

From e68c110649ab4283ce80a37bb4a9bb8a66e39526 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 10:11:51 +0200
Subject: [PATCH 13/62] Correct saving results

---
 TinyBERT/fine_tune_bert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index 9dd52d92..f90fd5e3 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -60,6 +60,7 @@
 
 def result_to_file(result, file_name):
     with open(file_name, "a") as writer:
+        writer.write("")
         logger.info("***** Eval results *****")
         for key in sorted(result.keys()):
             logger.info("  %s = %s", key, str(result[key]))

From 2c69c978b7e16c67110498d114f830da2648eca7 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 10:28:31 +0200
Subject: [PATCH 14/62] Add small correction

---
 TinyBERT/fine_tune_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index f90fd5e3..4b7a9881 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -368,7 +368,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 1 or \
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \
                         (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))

From b12e2e053a2c63611721ae0ecf3c42b16a383199 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 10:44:55 +0200
Subject: [PATCH 15/62] Add small correction for task_distill.py

---
 TinyBERT/task_distill.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py
index 39628a38..5c2f3fa1 100644
--- a/TinyBERT/task_distill.py
+++ b/TinyBERT/task_distill.py
@@ -61,6 +61,7 @@
 def result_to_file(result, file_name):
     with open(file_name, "a") as writer:
         logger.info("***** Eval results *****")
+        writer.write("")
         for key in sorted(result.keys()):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
@@ -428,7 +429,8 @@ def soft_cross_entropy(predicts, targets):
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0:
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \
+                        (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
                     logger.info("  Num examples = %d", len(eval_examples))

From 008b180bfb25c73554ecd89563a8be829e5f4fa0 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 15 Sep 2021 12:14:21 +0200
Subject: [PATCH 16/62] Add minor corrects

---
 TinyBERT/fine_tune_bert.py | 16 ++++++++--------
 TinyBERT/task_distill.py   |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index 4b7a9881..e6707fcb 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -195,13 +195,13 @@ def main():
 
     # intermediate distillation default parameters
     default_params = {
-        "cola": {"num_train_epochs": 5, "max_seq_length": 64},
-        "mnli": {"num_train_epochs": 5, "max_seq_length": 128},
-        "mrpc": {"num_train_epochs": 5, "max_seq_length": 128},
-        "sst-2": {"num_train_epochs": 5, "max_seq_length": 64},
-        "sts-b": {"num_train_epochs": 5, "max_seq_length": 128},
-        "qqp": {"num_train_epochs": 5, "max_seq_length": 128},
-        "qnli": {"num_train_epochs": 5, "max_seq_length": 128},
+        "cola": {"num_train_epochs": 3, "max_seq_length": 64},
+        "mnli": {"num_train_epochs": 3, "max_seq_length": 128},
+        "mrpc": {"num_train_epochs": 3, "max_seq_length": 128},
+        "sst-2": {"num_train_epochs":3, "max_seq_length": 64},
+        "sts-b": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qqp": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qnli": {"num_train_epochs": 3, "max_seq_length": 128},
         "rte": {"num_train_epochs": 5, "max_seq_length": 128}
     }
 
@@ -368,7 +368,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \
                         (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
diff --git a/TinyBERT/task_distill.py b/TinyBERT/task_distill.py
index 5c2f3fa1..2e6b9b6b 100644
--- a/TinyBERT/task_distill.py
+++ b/TinyBERT/task_distill.py
@@ -429,7 +429,7 @@ def soft_cross_entropy(predicts, targets):
                     optimizer.zero_grad()
                     global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 0 or \
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \
                         (global_step + 1) == num_train_optimization_steps:
                     logger.info("***** Running evaluation *****")
                     logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))

From 0996bde9961ce793736fe4bb18483ead77ae010f Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 21 Sep 2021 11:52:00 +0200
Subject: [PATCH 17/62] feat: add script counting number of parameters and MACs
 of model

---
 TinyBERT/model_statistics.py | 90 ++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 TinyBERT/model_statistics.py

diff --git a/TinyBERT/model_statistics.py b/TinyBERT/model_statistics.py
new file mode 100644
index 00000000..75f3884f
--- /dev/null
+++ b/TinyBERT/model_statistics.py
@@ -0,0 +1,90 @@
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import math
+import sys
+
+import torch
+from thop import profile
+
+from data_processing import processors
+from transformer.modeling import TinyBertForSequenceClassification
+from transformer.tokenization import BertTokenizer
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler('debug_layer_loss.log')
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+logger = logging.getLogger()
+
+
+def print_results(macs, params, title=''):
+    if len(title) != 0:
+        print("- " + title)
+    print(f"\tmacs [G]: {macs / math.pow(10, 9):.2f}, params [M]: {params / math.pow(10, 6):.2f}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model",
+                        default=None,
+                        type=str,
+                        help="The anlised model dir.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    args = parser.parse_args()
+    logger.info('The args: {}'.format(args))
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S',
+                        level=logging.INFO)
+
+    logger.info("device: {} n_gpu: {}".format(device, n_gpu))
+
+    task_name = args.task_name.lower()
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % task_name)
+
+    processor = processors[task_name]()
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case)
+
+    model = TinyBertForSequenceClassification.from_pretrained(args.model, num_labels=num_labels)
+    model.to(device)
+
+    model_input = tuple([torch.randint(high=len(tokenizer.vocab),
+                                       size=(1, args.max_seq_length), dtype=torch.int64, device=device),
+                         torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device),
+                         torch.randint(high=1, size=(1, args.max_seq_length), dtype=torch.int64, device=device)])
+
+    macs, params = profile(model, inputs=model_input)
+
+    print("Results")
+    print_results(macs, params)
+
+
+if __name__ == "__main__":
+    main()

From b6c971664fd66e30eeca05772b62c7bc0a4c28e4 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 21 Sep 2021 11:59:38 +0200
Subject: [PATCH 18/62] feat: update requirements.txt

---
 TinyBERT/requirements.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/TinyBERT/requirements.txt b/TinyBERT/requirements.txt
index 5f5389e8..2a155cdd 100644
--- a/TinyBERT/requirements.txt
+++ b/TinyBERT/requirements.txt
@@ -7,4 +7,6 @@ requests
 
 torch>=1.0.1
 scipy>=0.14.0
-seaborn
\ No newline at end of file
+seaborn
+
+thop

From 4b93a392a8e4b01340a7122c61116b45bdcb6cbd Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 21 Sep 2021 12:01:35 +0200
Subject: [PATCH 19/62] fix: add no_cuda argument

---
 TinyBERT/model_statistics.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/TinyBERT/model_statistics.py b/TinyBERT/model_statistics.py
index 75f3884f..a26b6252 100644
--- a/TinyBERT/model_statistics.py
+++ b/TinyBERT/model_statistics.py
@@ -47,6 +47,9 @@ def main():
     parser.add_argument("--do_lower_case",
                         action='store_true',
                         help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
 
     args = parser.parse_args()
     logger.info('The args: {}'.format(args))

From 222d8f1a89647792a97e1f4bd408a38e303bc5cf Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 14:06:24 +0100
Subject: [PATCH 20/62] feat: small correction in logging during finetuning

---
 TinyBERT/fine_tune_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TinyBERT/fine_tune_bert.py b/TinyBERT/fine_tune_bert.py
index e6707fcb..4a848775 100644
--- a/TinyBERT/fine_tune_bert.py
+++ b/TinyBERT/fine_tune_bert.py
@@ -63,7 +63,7 @@ def result_to_file(result, file_name):
         writer.write("")
         logger.info("***** Eval results *****")
         for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
+            logger.info("%s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
 
 

From 7f3a20d420ff45be3b977fb5988a08600d519fb0 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 16:25:56 +0100
Subject: [PATCH 21/62] feat: add scripts for multiemo

---
 TernaryBERT/download_bert_base.py       |  31 ++
 TernaryBERT/fine_tune_bert.py           | 427 ++++++++++++++++++++++++
 TernaryBERT/quant_task_glue.py          | 167 ++++-----
 TernaryBERT/quant_task_polemo.py        | 422 +++++++++++++++++++++++
 TernaryBERT/scripts/download_dataset.py |  54 +++
 TernaryBERT/utils_glue.py               |  28 +-
 TernaryBERT/utils_multiemo.py           | 226 +++++++++++++
 7 files changed, 1262 insertions(+), 93 deletions(-)
 create mode 100644 TernaryBERT/download_bert_base.py
 create mode 100644 TernaryBERT/fine_tune_bert.py
 create mode 100644 TernaryBERT/quant_task_polemo.py
 create mode 100644 TernaryBERT/scripts/download_dataset.py
 create mode 100644 TernaryBERT/utils_multiemo.py

diff --git a/TernaryBERT/download_bert_base.py b/TernaryBERT/download_bert_base.py
new file mode 100644
index 00000000..fa99e41a
--- /dev/null
+++ b/TernaryBERT/download_bert_base.py
@@ -0,0 +1,31 @@
+import os
+import requests
+import tarfile
+
+# Pretrained bert-base-uncased archive and the local paths it is unpacked to.
+url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz'
+output_path = os.path.join('data', 'models')
+os.makedirs(output_path, exist_ok=True)
+output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz')
+model_folder = os.path.join(output_path, 'bert-base-uncased')
+
+# Stream the archive to disk in chunks and fail loudly on an HTTP error; a
+# non-200 reply used to be skipped silently and crash later in tarfile.open().
+with requests.get(url, stream=True) as response:
+    response.raise_for_status()
+    with open(output_tar, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=1024 * 1024):
+            f.write(chunk)
+
+with tarfile.open(name=output_tar, mode="r|gz") as tar_ref:
+    tar_ref.extractall(model_folder)
+# The archive names its config 'bert_config.json'; store it as 'config.json'.
+os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json'))
+os.remove(output_tar)
+
+url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt'
+r = requests.get(url_vocab)
+r.raise_for_status()  # do not save an HTTP error page as vocab.txt
+with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f:
+    f.write(r.content)
+print('Completed!')
diff --git a/TernaryBERT/fine_tune_bert.py b/TernaryBERT/fine_tune_bert.py
new file mode 100644
index 00000000..e2c1d1ee
--- /dev/null
+++ b/TernaryBERT/fine_tune_bert.py
@@ -0,0 +1,427 @@
+# coding=utf-8
+# 2019.12.2-Changed for TinyBERT task-specific distillation
+#      Huawei Technologies Co., Ltd. <yinyichun@huawei.com>
+# Copyright 2020 Huawei Technologies Co., Ltd.
+# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler)
+from tqdm import tqdm, trange
+
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from utils_multiemo import *
+from transformer.modeling import BertForSequenceClassification
+from transformer.tokenization import BertTokenizer
+from transformer.optimization import BertAdam
+from transformer.file_utils import WEIGHTS_NAME, CONFIG_NAME
+
+csv.field_size_limit(sys.maxsize)  # raise the csv module's maximum field size to sys.maxsize
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler('debug_layer_loss.log')  # also log to this file (relative path: created in the working directory)
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+logger = logging.getLogger()  # the root logger, used by the functions below
+
+
+def result_to_file(result, file_name):
+    """Log each metric of ``result`` in key order and append it to the file ``file_name``."""
+    with open(file_name, "a") as out_file:
+        logger.info("***** Eval results *****")
+        for metric, value in sorted(result.items()):
+            logger.info("%s = %s", metric, str(value))
+            out_file.write("%s = %s\n" % (metric, str(value)))
+
+
+def get_tensor_data(output_mode, features):
+    """Build a TensorDataset from ``features``; return ``(dataset, label_tensor)``."""
+    label_dtypes = {"classification": torch.long, "regression": torch.float}
+    if output_mode not in label_dtypes:  # used to surface later as an UnboundLocalError
+        raise ValueError("Unknown output mode: %s" % output_mode)
+    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtypes[output_mode])
+    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths)
+    return tensor_data, all_label_ids
+
+
+def do_eval(model, task_name, eval_dataloader,
+            device, output_mode, eval_labels, num_labels):
+    """Run ``model`` over ``eval_dataloader`` and return the task metrics.
+
+    Returns the dict produced by ``compute_metrics(task_name, preds, labels)``
+    with ``eval_loss`` (mean of the per-batch losses) added. The model is not
+    switched to eval mode here; callers do that before calling.
+
+    Raises ValueError for an unsupported ``output_mode``.
+    """
+    if output_mode == "classification":
+        loss_fct = CrossEntropyLoss()
+    elif output_mode == "regression":
+        loss_fct = MSELoss()
+    else:
+        raise ValueError("Unknown output mode: %s" % output_mode)
+
+    total_loss = 0.
+    batch_logits = []
+    for batch_ in tqdm(eval_dataloader, desc="Evaluating"):
+        input_ids, input_mask, segment_ids, label_ids, _ = (t.to(device) for t in batch_)
+        with torch.no_grad():
+            logits, _, _ = model(input_ids, segment_ids, input_mask)
+
+        if output_mode == "classification":
+            batch_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+        else:
+            batch_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+        total_loss += batch_loss.mean().item()
+        batch_logits.append(logits.detach().cpu().numpy())
+
+    # Concatenate once; appending to a growing array re-copied it every batch.
+    preds = np.concatenate(batch_logits, axis=0)
+    if output_mode == "classification":
+        preds = np.argmax(preds, axis=1)
+    else:
+        preds = np.squeeze(preds)
+    result = compute_metrics(task_name, preds, eval_labels.numpy())
+    result['eval_loss'] = total_loss / len(batch_logits)
+    return result
+
+
+def main():  # CLI entry point: evaluate once with --do_eval, otherwise fine-tune with periodic dev evaluation
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--pretrained_model",
+                        default=None,
+                        type=str,
+                        help="The pretrained model dir.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--weight_decay', '--wd',
+                        default=1e-4,
+                        type=float,
+                        metavar='W',
+                        help='weight decay')
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+
+    # added arguments
+    parser.add_argument('--aug_train',
+                        action='store_true')
+    parser.add_argument('--eval_step',
+                        type=int,
+                        default=50)
+
+    args = parser.parse_args()
+    logger.info('The args: {}'.format(args))
+
+    # Default parameters per task, looked up by lower-cased task name further down
+    default_params = {
+        "multiemo": {"num_train_epochs": 3, "max_seq_length": 128},
+        "cola": {"num_train_epochs": 3, "max_seq_length": 64},
+        "mnli": {"num_train_epochs": 3, "max_seq_length": 128},
+        "mrpc": {"num_train_epochs": 3, "max_seq_length": 128},
+        "sst-2": {"num_train_epochs": 3, "max_seq_length": 64},
+        "sts-b": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qqp": {"num_train_epochs": 3, "max_seq_length": 128},
+        "qnli": {"num_train_epochs": 3, "max_seq_length": 128},
+        "rte": {"num_train_epochs": 5, "max_seq_length": 128}
+    }
+
+    acc_tasks = ["multiemo", "mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]  # best checkpoint chosen by result['acc']
+    corr_tasks = ["sts-b"]  # best checkpoint chosen by result['corr']
+    mcc_tasks = ["cola"]  # best checkpoint chosen by result['mcc']
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S',
+                        level=logging.INFO)  # NOTE(review): logging is already configured at import time, so this is likely a no-op - confirm
+
+    logger.info("device: {} n_gpu: {}".format(device, n_gpu))
+
+    # Prepare seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    # Prepare task settings: refuse to reuse a non-empty output directory, create it if missing
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    task_name = args.task_name.lower()
+
+    if task_name in default_params:
+        args.max_seq_len = default_params[task_name]["max_seq_length"]  # NOTE(review): sets max_seq_len, but the code below reads args.max_seq_length, so this has no effect - confirm intent
+
+    if not args.do_eval:
+        if task_name in default_params:
+            args.num_train_epoch = default_params[task_name]["num_train_epochs"]  # NOTE(review): sets num_train_epoch, but the code below reads args.num_train_epochs - confirm intent
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % task_name)
+
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
+
+    if not args.do_eval:
+        if not args.aug_train:
+            train_examples = processor.get_train_examples(args.data_dir)
+        else:
+            train_examples = processor.get_aug_examples(args.data_dir)
+        if args.gradient_accumulation_steps < 1:
+            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                args.gradient_accumulation_steps))
+
+        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps  # batch size per forward/backward pass
+
+        num_train_optimization_steps = int(
+            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+
+        train_features = convert_examples_to_features(train_examples, label_list,
+                                                      args.max_seq_length, tokenizer, output_mode)
+        train_data, _ = get_tensor_data(output_mode, train_features)
+        train_sampler = RandomSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_examples = processor.get_dev_examples(args.data_dir)
+    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels)
+    model.to(device)
+
+    if args.do_eval:
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+
+        model.eval()
+        result = do_eval(model, task_name, eval_dataloader,
+                         device, output_mode, eval_labels, num_labels)
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+    else:
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+        if n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+        # Prepare optimizer. NOTE(review): weight decay below is hard-coded to 0.01; args.weight_decay is never read - confirm
+        param_optimizer = list(model.named_parameters())
+        size = 0  # running count of parameter elements, logged below
+        for n, p in model.named_parameters():
+            logger.info('n: {}'.format(n))
+            size += p.nelement()
+
+        logger.info('Total parameters: {}'.format(size))
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        schedule = 'warmup_linear'
+
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                             schedule=schedule,
+                             lr=args.learning_rate,
+                             warmup=args.warmup_proportion,
+                             t_total=num_train_optimization_steps)
+
+        # Train and evaluate
+        global_step = 0
+        best_dev_acc = 0.0  # best dev score so far (acc, corr or mcc, depending on the task)
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+
+        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0.
+            tr_cls_loss = 0.
+
+            model.train()
+            nb_tr_examples, nb_tr_steps = 0, 0
+
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)):
+                batch = tuple(t.to(device) for t in batch)
+
+                input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
+                if input_ids.size()[0] != args.train_batch_size:
+                    continue  # skip any batch that is not exactly train_batch_size long
+
+                cls_loss = 0.
+
+                logits, _, _ = model(input_ids, segment_ids, input_mask)
+
+                if output_mode == "classification":
+                    loss_fct = CrossEntropyLoss()
+                    cls_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+                elif output_mode == "regression":
+                    loss_mse = MSELoss()
+                    cls_loss = loss_mse(logits.view(-1), label_ids.view(-1))
+
+                loss = cls_loss
+                tr_cls_loss += cls_loss.item()
+
+                if n_gpu > 1:
+                    loss = loss.mean()  # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                loss.backward()
+
+                tr_loss += loss.item()
+                nb_tr_examples += label_ids.size(0)
+                nb_tr_steps += 1
+
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+
+                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \
+                        (global_step + 1) == num_train_optimization_steps:  # NOTE(review): checked on every batch, so with gradient accumulation the same global_step can trigger evaluation more than once - confirm
+                    logger.info("***** Running evaluation *****")
+                    logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
+                    logger.info("  Num examples = %d", len(eval_examples))
+                    logger.info("  Batch size = %d", args.eval_batch_size)
+
+                    model.eval()
+
+                    loss = tr_loss / (step + 1)  # running average over the batches of the current epoch
+                    cls_loss = tr_cls_loss / (step + 1)
+
+                    result = do_eval(model, task_name, eval_dataloader,
+                                     device, output_mode, eval_labels, num_labels)
+                    result['global_step'] = global_step
+                    result['cls_loss'] = cls_loss
+                    result['loss'] = loss
+
+                    result_to_file(result, output_eval_file)
+
+                    save_model = False
+
+                    if task_name in acc_tasks and result['acc'] > best_dev_acc:
+                        best_dev_acc = result['acc']
+                        save_model = True
+
+                    if task_name in corr_tasks and result['corr'] > best_dev_acc:
+                        best_dev_acc = result['corr']
+                        save_model = True
+
+                    if task_name in mcc_tasks and result['mcc'] > best_dev_acc:
+                        best_dev_acc = result['mcc']
+                        save_model = True
+
+                    if save_model:
+                        logger.info("***** Save model *****")
+                        model_to_save = model.module if hasattr(model, 'module') else model  # unwrap torch.nn.DataParallel if it was applied above
+
+                        model_name = WEIGHTS_NAME
+
+                        output_model_file = os.path.join(args.output_dir, model_name)
+                        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+                        torch.save(model_to_save.state_dict(), output_model_file)
+                        model_to_save.config.to_json_file(output_config_file)
+                        tokenizer.save_vocabulary(args.output_dir)
+
+                    model.train()
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
diff --git a/TernaryBERT/quant_task_glue.py b/TernaryBERT/quant_task_glue.py
index 1356da24..a4740e62 100644
--- a/TernaryBERT/quant_task_glue.py
+++ b/TernaryBERT/quant_task_glue.py
@@ -10,11 +10,11 @@
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.tensorboard import SummaryWriter
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from transformer import BertForSequenceClassification,WEIGHTS_NAME, CONFIG_NAME
+from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME
 from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
 from transformer import BertTokenizer
 from transformer import BertAdam
@@ -26,6 +26,7 @@
                     format=log_format, datefmt='%m/%d %I:%M:%S %p')
 logger = logging.getLogger()
 
+
 def get_tensor_data(output_mode, features):
     if output_mode == "classification":
         all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
@@ -36,16 +37,17 @@ def get_tensor_data(output_mode, features):
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
     all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_label_ids, all_seq_lengths)
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths)
     return tensor_data, all_label_ids
 
+
 def do_eval(model, task_name, eval_dataloader,
             device, output_mode, eval_labels, num_labels):
     eval_loss = 0
     nb_eval_steps = 0
     preds = []
 
-    for _,batch_ in enumerate(eval_dataloader):
+    for _, batch_ in enumerate(eval_dataloader):
         batch_ = tuple(t.to(device) for t in batch_)
         with torch.no_grad():
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
@@ -78,11 +80,13 @@ def do_eval(model, task_name, eval_dataloader,
     result['eval_loss'] = eval_loss
     return result
 
+
 def soft_cross_entropy(predicts, targets):
     student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1)
     targets_prob = torch.nn.functional.softmax(targets, dim=-1)
     return (- targets_prob * student_likelihood).mean()
 
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--data_dir",
@@ -109,7 +113,7 @@ def main():
                         default='output',
                         type=str,
                         help="The output directory where the model predictions and checkpoints will be written.")
-    
+
     parser.add_argument("--learning_rate",
                         default=2e-5,
                         type=float,
@@ -122,7 +126,7 @@ def main():
                         type=int,
                         default=42,
                         help="random seed for initialization")
-    
+
     parser.add_argument('--aug_train',
                         action='store_false',
                         help="Whether to use augmented data or not")
@@ -142,7 +146,7 @@ def main():
     parser.add_argument("--weight_bits",
                         default=2,
                         type=int,
-                        choices=[2,8],
+                        choices=[2, 8],
                         help="Quantization bits for weight.")
     parser.add_argument("--input_bits",
                         default=8,
@@ -158,17 +162,17 @@ def main():
     summaryWriter = SummaryWriter(args.output_dir)
     logger.info('The args: {}'.format(args))
     task_name = args.task_name.lower()
-    data_dir = os.path.join(args.data_dir,task_name)
-    output_dir = os.path.join(args.output_dir,task_name)
+    data_dir = os.path.join(args.data_dir, task_name)
+    output_dir = os.path.join(args.output_dir, task_name)
     # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name)
 
     if not os.path.exists(output_dir):
         os.mkdir(output_dir)
-    
+
     if args.student_model is None:
-        args.student_model = os.path.join(args.model_dir,task_name)
+        args.student_model = os.path.join(args.model_dir, task_name)
     if args.teacher_model is None:
-        args.teacher_model = os.path.join(args.model_dir,task_name)
+        args.teacher_model = os.path.join(args.model_dir, task_name)
 
     processors = {
         "cola": ColaProcessor,
@@ -194,14 +198,14 @@ def main():
     }
 
     default_params = {
-        "cola": {"max_seq_length": 64,"batch_size":16,"eval_step":50},
-        "mnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000},
-        "mrpc": {"max_seq_length": 128,"batch_size":32,"eval_step":200},
-        "sst-2": {"max_seq_length": 64,"batch_size":32,"eval_step":200},
-        "sts-b": {"max_seq_length": 128,"batch_size":32,"eval_step":50},
-        "qqp": {"max_seq_length": 128,"batch_size":32,"eval_step":1000},
-        "qnli": {"max_seq_length": 128,"batch_size":32,"eval_step":1000},
-        "rte": {"max_seq_length": 128,"batch_size":32,"eval_step":100}
+        "cola": {"max_seq_length": 64, "batch_size": 16, "eval_step": 50},
+        "mnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000},
+        "mrpc": {"max_seq_length": 128, "batch_size": 32, "eval_step": 200},
+        "sst-2": {"max_seq_length": 64, "batch_size": 32, "eval_step": 200},
+        "sts-b": {"max_seq_length": 128, "batch_size": 32, "eval_step": 50},
+        "qqp": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000},
+        "qnli": {"max_seq_length": 128, "batch_size": 32, "eval_step": 1000},
+        "rte": {"max_seq_length": 128, "batch_size": 32, "eval_step": 100}
     }
 
     acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
@@ -218,11 +222,11 @@ def main():
     torch.manual_seed(args.seed)
     if n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
- 
+
     if task_name in default_params:
         args.batch_size = default_params[task_name]["batch_size"]
         if n_gpu > 0:
-            args.batch_size = int(args.batch_size*n_gpu)
+            args.batch_size = int(args.batch_size * n_gpu)
         args.max_seq_length = default_params[task_name]["max_seq_length"]
         args.eval_step = default_params[task_name]["eval_step"]
 
@@ -232,35 +236,36 @@ def main():
     num_labels = len(label_list)
 
     tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True)
-    
+
     if args.aug_train:
         try:
-            train_file = os.path.join(processed_data_dir,'aug_data')
-            train_features = pickle.load(open(train_file,'rb'))
+            train_file = os.path.join(processed_data_dir, 'aug_data')
+            train_features = pickle.load(open(train_file, 'rb'))
         except:
             train_examples = processor.get_aug_examples(data_dir)
             train_features = convert_examples_to_features(train_examples, label_list,
-                                            args.max_seq_length, tokenizer, output_mode)
+                                                          args.max_seq_length, tokenizer, output_mode)
     else:
         try:
-            train_file = os.path.join(processed_data_dir,'train_data')
-            train_features = pickle.load(open(train_file,'rb'))
+            train_file = os.path.join(processed_data_dir, 'train_data')
+            train_features = pickle.load(open(train_file, 'rb'))
         except:
             train_examples = processor.get_train_examples(data_dir)
             train_features = convert_examples_to_features(train_examples, label_list,
-                                            args.max_seq_length, tokenizer, output_mode)
+                                                          args.max_seq_length, tokenizer, output_mode)
 
     num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs
     train_data, _ = get_tensor_data(output_mode, train_features)
     train_sampler = RandomSampler(train_data)
     train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
-    
+
     try:
-        dev_file = train_file = os.path.join(processed_data_dir,'dev_data')
-        eval_features = pickle.load(open(dev_file,'rb'))
+        dev_file = train_file = os.path.join(processed_data_dir, 'dev_data')
+        eval_features = pickle.load(open(dev_file, 'rb'))
     except:
         eval_examples = processor.get_dev_examples(data_dir)
-        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer,
+                                                     output_mode)
 
     eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
     eval_sampler = SequentialSampler(eval_data)
@@ -268,13 +273,13 @@ def main():
     if task_name == "mnli":
         processor = processors["mnli-mm"]()
         try:
-            dev_mm_file = train_file = os.path.join(processed_data_dir,'dev-mm_data')
-            mm_eval_features = pickle.load(open(dev_mm_file,'rb'))
+            dev_mm_file = train_file = os.path.join(processed_data_dir, 'dev-mm_data')
+            mm_eval_features = pickle.load(open(dev_mm_file, 'rb'))
         except:
             mm_eval_examples = processor.get_dev_examples(data_dir)
             mm_eval_features = convert_examples_to_features(
                 mm_eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-        
+
         mm_eval_data, mm_eval_labels = get_tensor_data(output_mode, mm_eval_features)
         logger.info("  Num examples = %d", len(mm_eval_features))
 
@@ -289,11 +294,11 @@ def main():
         teacher_model = torch.nn.DataParallel(teacher_model)
 
     result = do_eval(teacher_model, task_name, eval_dataloader,
-                    device, output_mode, eval_labels, num_labels)
+                     device, output_mode, eval_labels, num_labels)
     if task_name in acc_tasks:
-        if task_name in ['sst-2','mnli','qnli','rte']:
+        if task_name in ['sst-2', 'mnli', 'qnli', 'rte']:
             fp32_performance = f"acc:{result['acc']}"
-        elif task_name in ['mrpc','qqp']:
+        elif task_name in ['mrpc', 'qqp']:
             fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}"
     if task_name in corr_tasks:
         fp32_performance = f"pearson/spearmanr:{result['pearson']}/{result['spearmanr']}"
@@ -303,15 +308,16 @@ def main():
 
     if task_name == "mnli":
         result = do_eval(teacher_model, 'mnli-mm', mm_eval_dataloader,
-                            device, output_mode, mm_eval_labels, num_labels)
+                         device, output_mode, mm_eval_labels, num_labels)
         fp32_performance += f"  mm-acc:{result['acc']}"
-    fp32_performance = task_name +' fp32   ' + fp32_performance
-    student_config = BertConfig.from_pretrained(args.teacher_model, 
+    fp32_performance = task_name + ' fp32   ' + fp32_performance
+    student_config = BertConfig.from_pretrained(args.teacher_model,
                                                 quantize_act=True,
-                                                weight_bits = args.weight_bits,
-                                                input_bits = args.input_bits,
-                                                clip_val = args.clip_val)
-    student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config = student_config, num_labels=num_labels)
+                                                weight_bits=args.weight_bits,
+                                                input_bits=args.input_bits,
+                                                clip_val=args.clip_val)
+    student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config=student_config,
+                                                                       num_labels=num_labels)
     student_model.to(device)
 
     logger.info("***** Running training *****")
@@ -320,7 +326,7 @@ def main():
     logger.info("  Num steps = %d", num_train_optimization_steps)
     if n_gpu > 1:
         student_model = torch.nn.DataParallel(student_model)
-        
+
     # Prepare optimizer
     param_optimizer = list(student_model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
@@ -330,15 +336,15 @@ def main():
     ]
     schedule = 'warmup_linear'
     optimizer = BertAdam(optimizer_grouped_parameters,
-                            schedule=schedule,
-                            lr=args.learning_rate,
-                            warmup=0.1,
-                            t_total=num_train_optimization_steps)
+                         schedule=schedule,
+                         lr=args.learning_rate,
+                         warmup=0.1,
+                         t_total=num_train_optimization_steps)
     loss_mse = MSELoss()
     global_step = 0
     best_dev_acc = 0.0
     previous_best = None
-    
+
     tr_loss = 0.
     tr_att_loss = 0.
     tr_rep_loss = 0.
@@ -359,10 +365,10 @@ def main():
 
             with torch.no_grad():
                 teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask)
-            
+
             if args.pred_distill:
                 if output_mode == "classification":
-                    cls_loss = soft_cross_entropy(student_logits,teacher_logits)
+                    cls_loss = soft_cross_entropy(student_logits, teacher_logits)
                 elif output_mode == "regression":
                     cls_loss = loss_mse(student_logits, teacher_logits)
 
@@ -372,9 +378,9 @@ def main():
             if args.intermediate_distill:
                 for student_att, teacher_att in zip(student_atts, teacher_atts):
                     student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device),
-                                                student_att)
+                                              student_att)
                     teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device),
-                                                teacher_att)
+                                              teacher_att)
                     tmp_loss = loss_mse(student_att, teacher_att)
                     att_loss += tmp_loss
 
@@ -397,7 +403,7 @@ def main():
             tr_loss += loss.item()
             nb_tr_examples += label_ids.size(0)
             nb_tr_steps += 1
-            if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps-1:
+            if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1:
                 logger.info("***** Running evaluation *****")
                 logger.info("  {} step of {} steps".format(global_step, num_train_optimization_steps))
                 if previous_best is not None:
@@ -411,34 +417,34 @@ def main():
                 rep_loss = tr_rep_loss / (step + 1)
 
                 result = do_eval(student_model, task_name, eval_dataloader,
-                                    device, output_mode, eval_labels, num_labels)
+                                 device, output_mode, eval_labels, num_labels)
                 result['global_step'] = global_step
                 result['cls_loss'] = cls_loss
                 result['att_loss'] = att_loss
                 result['rep_loss'] = rep_loss
                 result['loss'] = loss
-                summaryWriter.add_scalar('total_loss',loss,global_step)
-                summaryWriter.add_scalars('distill_loss',{'att_loss':att_loss,
-                                            'rep_loss':rep_loss,
-                                            'cls_loss':cls_loss},global_step)
-
-                if task_name=='cola':
-                    summaryWriter.add_scalar('mcc',result['mcc'],global_step)
-                elif task_name in ['sst-2','mnli','mnli-mm','qnli','rte','wnli']:
-                    summaryWriter.add_scalar('acc',result['acc'],global_step)
-                elif task_name in ['mrpc','qqp']:
-                    summaryWriter.add_scalars('performance',{'acc':result['acc'],
-                                                'f1':result['f1'],
-                                                'acc_and_f1':result['acc_and_f1']},global_step)
+                summaryWriter.add_scalar('total_loss', loss, global_step)
+                summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss,
+                                                           'rep_loss': rep_loss,
+                                                           'cls_loss': cls_loss}, global_step)
+
+                if task_name == 'cola':
+                    summaryWriter.add_scalar('mcc', result['mcc'], global_step)
+                elif task_name in ['sst-2', 'mnli', 'mnli-mm', 'qnli', 'rte', 'wnli']:
+                    summaryWriter.add_scalar('acc', result['acc'], global_step)
+                elif task_name in ['mrpc', 'qqp']:
+                    summaryWriter.add_scalars('performance', {'acc': result['acc'],
+                                                              'f1': result['f1'],
+                                                              'acc_and_f1': result['acc_and_f1']}, global_step)
                 else:
-                    summaryWriter.add_scalar('corr',result['corr'],global_step)
+                    summaryWriter.add_scalar('corr', result['corr'], global_step)
 
                 save_model = False
 
                 if task_name in acc_tasks and result['acc'] > best_dev_acc:
-                    if task_name in ['sst-2','mnli','qnli','rte']:
+                    if task_name in ['sst-2', 'mnli', 'qnli', 'rte']:
                         previous_best = f"acc:{result['acc']}"
-                    elif task_name in ['mrpc','qqp']:
+                    elif task_name in ['mrpc', 'qqp']:
                         previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
                     best_dev_acc = result['acc']
                     save_model = True
@@ -457,8 +463,8 @@ def main():
                     # Test mnli-mm
                     if task_name == "mnli":
                         result = do_eval(student_model, 'mnli-mm', mm_eval_dataloader,
-                                            device, output_mode, mm_eval_labels, num_labels)
-                        previous_best+= f"mm-acc:{result['acc']}"
+                                         device, output_mode, mm_eval_labels, num_labels)
+                        previous_best += f"mm-acc:{result['acc']}"
                     logger.info(fp32_performance)
                     logger.info(previous_best)
                     if args.save_fp_model:
@@ -478,10 +484,11 @@ def main():
                         model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
                         quant_model = copy.deepcopy(model_to_save)
                         for name, module in quant_model.named_modules():
-                            if hasattr(module,'weight_quantizer'):
-                                module.weight.data = module.weight_quantizer.apply(module.weight,module.weight_clip_val,
-                                                                             module.weight_bits,True)
-                                
+                            if hasattr(module, 'weight_quantizer'):
+                                module.weight.data = module.weight_quantizer.apply(module.weight,
+                                                                                   module.weight_clip_val,
+                                                                                   module.weight_bits, True)
+
                         output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
                         output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)
 
diff --git a/TernaryBERT/quant_task_polemo.py b/TernaryBERT/quant_task_polemo.py
new file mode 100644
index 00000000..ca07bfe3
--- /dev/null
+++ b/TernaryBERT/quant_task_polemo.py
@@ -0,0 +1,422 @@
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import sys
+import pickle
+import copy
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.tensorboard import SummaryWriter
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME
+from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
+from transformer import BertTokenizer
+from transformer import BertAdam
+from transformer import BertConfig
+from utils_multiemo import *
+
+log_format = '%(asctime)s %(message)s'  # timestamp followed by the message, no level/logger name
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,  # INFO and above, written to stdout
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')  # month/day + 12-hour clock
+logger = logging.getLogger()  # root logger
+
+
+def get_tensor_data(output_mode, features):
+    """Stack per-example features into a TensorDataset; return (dataset, label tensor)."""
+    if output_mode not in ("classification", "regression"):
+        raise ValueError("Unknown output mode: %s" % output_mode)
+    # Labels are long for classification and float for regression; all other fields are long.
+    label_dtype = torch.long if output_mode == "classification" else torch.float
+    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
+    all_input_ids, all_input_mask, all_segment_ids, all_seq_lengths = (
+        torch.tensor([getattr(f, field) for f in features], dtype=torch.long)
+        for field in ("input_ids", "input_mask", "segment_ids", "seq_length"))
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths)
+    return tensor_data, all_label_ids
+
+
+def do_eval(model, task_name, eval_dataloader,
+            device, output_mode, eval_labels, num_labels):
+    """Evaluate ``model`` on ``eval_dataloader`` and return the task metrics.
+
+    Batches are unpacked as (input_ids, input_mask, segment_ids, label_ids,
+    seq_lengths). The returned dict is whatever ``compute_metrics`` yields for
+    ``task_name``, plus ``'eval_loss'`` (mean of the per-batch losses).
+    """
+    # Build the criterion once instead of once per batch.
+    if output_mode == "classification":
+        loss_fct = CrossEntropyLoss()
+    elif output_mode == "regression":
+        loss_fct = MSELoss()
+    eval_loss = 0
+    nb_eval_steps = 0
+    batch_logits = []
+    for batch_ in eval_dataloader:
+        input_ids, input_mask, segment_ids, label_ids, _ = (t.to(device) for t in batch_)
+        with torch.no_grad():
+            logits, _, _ = model(input_ids, segment_ids, input_mask)
+        if output_mode == "classification":
+            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+        elif output_mode == "regression":
+            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+        eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+        batch_logits.append(logits.detach().cpu().numpy())
+
+    eval_loss = eval_loss / nb_eval_steps
+    # One concatenation at the end avoids re-copying the growing array every batch.
+    preds = np.concatenate(batch_logits, axis=0)
+    if output_mode == "classification":
+        preds = np.argmax(preds, axis=1)
+    elif output_mode == "regression":
+        preds = np.squeeze(preds)
+    result = compute_metrics(task_name, preds, eval_labels.numpy())
+    result['eval_loss'] = eval_loss
+    return result
+
+
+def soft_cross_entropy(predicts, targets):
+    """Mean over all elements of -softmax(targets) * log_softmax(predicts), both on the last dim."""
+    teacher_probs, student_log_probs = targets.softmax(dim=-1), predicts.log_softmax(dim=-1)
+    return (-teacher_probs * student_log_probs).mean()
+
+
+def main():
+    """Quantization-aware distillation of a BERT student on a MultiEmo task.
+
+    ``--task_name`` must contain ``multiemo`` and split on ``_`` into exactly four
+    parts (prefix, lang, domain, kind); the student is distilled from the teacher.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir",
+                        default='data',
+                        type=str,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_dir",
+                        default='models/tinybert',
+                        type=str,
+                        help="The model dir.")
+    parser.add_argument("--teacher_model",
+                        default=None,
+                        type=str,
+                        help="The models directory.")
+    parser.add_argument("--student_model",
+                        default=None,
+                        type=str,
+                        help="The models directory.")
+    parser.add_argument("--task_name",
+                        default='sst-2',
+                        type=str,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default='output',
+                        type=str,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    parser.add_argument("--learning_rate",
+                        default=2e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--aug_train',
+                        action='store_false',
+                        help="Whether to use augmented data or not")
+    parser.add_argument('--pred_distill',
+                        action='store_true',
+                        help="Whether to distil with task layer")
+    parser.add_argument('--intermediate_distill',
+                        action='store_true',
+                        help="Whether to distil with intermediate layers")
+    parser.add_argument('--save_fp_model',
+                        action='store_true',
+                        help="Whether to save fp32 model")
+    parser.add_argument('--save_quantized_model',
+                        action='store_true',
+                        help="Whether to save quantized model")
+
+    parser.add_argument("--weight_bits",
+                        default=2,
+                        type=int,
+                        choices=[2, 8],
+                        help="Quantization bits for weight.")
+    parser.add_argument("--input_bits",
+                        default=8,
+                        type=int,
+                        help="Quantization bits for activation.")
+    parser.add_argument("--clip_val",
+                        default=2.5,
+                        type=float,
+                        help="Initial clip value.")
+
+    args = parser.parse_args()
+    assert args.pred_distill or args.intermediate_distill, "'pred_distill' and 'intermediate_distill', at least one must be True"
+    summaryWriter = SummaryWriter(args.output_dir)
+    logger.info('The args: {}'.format(args))
+    task_name = args.task_name.lower()
+    data_dir = os.path.join(args.data_dir, task_name)
+    output_dir = os.path.join(args.output_dir, task_name)
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    if args.student_model is None:
+        args.student_model = os.path.join(args.model_dir, task_name)
+    if args.teacher_model is None:
+        args.teacher_model = os.path.join(args.model_dir, task_name)
+
+    output_modes = {
+        "multiemo": "classification"
+    }
+
+    default_params = {
+        "multiemo": {"max_seq_length": 128, "batch_size": 16, "eval_step": 50}
+    }
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    # Prepare seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if 'multiemo' not in task_name:
+        raise ValueError("Task not found: %s" % task_name)
+    _, lang, domain, kind = task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    output_mode = output_modes['multiemo']
+
+    # task_name carries the lang/domain/kind suffix, so it never equals the 'multiemo'
+    # key; look the defaults up by family, otherwise batch_size etc. are never set.
+    task_defaults = default_params['multiemo']
+    args.batch_size = task_defaults["batch_size"]
+    if n_gpu > 0:
+        args.batch_size = int(args.batch_size * n_gpu)
+    args.max_seq_length = task_defaults["max_seq_length"]
+    args.eval_step = task_defaults["eval_step"]
+
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True)
+
+    # NOTE: --aug_train is a store_false flag, so augmented data is used unless it is passed.
+    if args.aug_train:
+        train_examples = processor.get_aug_examples(data_dir)
+        train_features = convert_examples_to_features(train_examples, label_list,
+                                                      args.max_seq_length, tokenizer, output_mode)
+    else:
+        train_examples = processor.get_train_examples(data_dir)
+        train_features = convert_examples_to_features(train_examples, label_list,
+                                                      args.max_seq_length, tokenizer, output_mode)
+
+    num_train_optimization_steps = int(len(train_features) / args.batch_size) * args.num_train_epochs
+    train_data, _ = get_tensor_data(output_mode, train_features)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
+
+    eval_examples = processor.get_dev_examples(data_dir)
+    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer,
+                                                 output_mode)
+
+    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+
+    teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model)
+    teacher_model.to(device)
+    teacher_model.eval()
+    if n_gpu > 1:
+        teacher_model = torch.nn.DataParallel(teacher_model)
+
+    # Score the teacher once so its dev-set result can be logged next to the student's.
+    result = do_eval(teacher_model, task_name, eval_dataloader,
+                     device, output_mode, eval_labels, num_labels)
+    fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}"
+    fp32_performance = task_name + ' fp32   ' + fp32_performance
+
+    student_config = BertConfig.from_pretrained(
+        args.teacher_model,
+        quantize_act=True,
+        weight_bits=args.weight_bits,
+        input_bits=args.input_bits,
+        clip_val=args.clip_val
+    )
+    student_model = QuantBertForSequenceClassification.from_pretrained(args.student_model, config=student_config,
+                                                                       num_labels=num_labels)
+    student_model.to(device)
+
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Batch size = %d", args.batch_size)
+    logger.info("  Num steps = %d", num_train_optimization_steps)
+    if n_gpu > 1:
+        student_model = torch.nn.DataParallel(student_model)
+
+    # Prepare optimizer
+    param_optimizer = list(student_model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    schedule = 'warmup_linear'
+    optimizer = BertAdam(optimizer_grouped_parameters,
+                         schedule=schedule,
+                         lr=args.learning_rate,
+                         warmup=0.1,
+                         t_total=num_train_optimization_steps)
+    loss_mse = MSELoss()
+    global_step = 0
+    best_dev_acc = 0.0
+    previous_best = None
+
+    tr_loss = 0.
+    tr_att_loss = 0.
+    tr_rep_loss = 0.
+    tr_cls_loss = 0.
+    for epoch_ in range(int(args.num_train_epochs)):
+        nb_tr_examples, nb_tr_steps = 0, 0
+
+        for step, batch in enumerate(train_dataloader):
+            student_model.train()
+            batch = tuple(t.to(device) for t in batch)
+            input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
+            att_loss = 0.
+            rep_loss = 0.
+            cls_loss = 0.
+            loss = 0.
+
+            student_logits, student_atts, student_reps = student_model(input_ids, segment_ids, input_mask)
+
+            with torch.no_grad():
+                teacher_logits, teacher_atts, teacher_reps = teacher_model(input_ids, segment_ids, input_mask)
+
+            if args.pred_distill:
+                if output_mode == "classification":
+                    cls_loss = soft_cross_entropy(student_logits, teacher_logits)
+                elif output_mode == "regression":
+                    cls_loss = loss_mse(student_logits, teacher_logits)
+
+                loss = cls_loss
+                tr_cls_loss += cls_loss.item()
+
+            if args.intermediate_distill:
+                for student_att, teacher_att in zip(student_atts, teacher_atts):
+                    student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(device),
+                                              student_att)
+                    teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(device),
+                                              teacher_att)
+                    tmp_loss = loss_mse(student_att, teacher_att)
+                    att_loss += tmp_loss
+
+                for student_rep, teacher_rep in zip(student_reps, teacher_reps):
+                    tmp_loss = loss_mse(student_rep, teacher_rep)
+                    rep_loss += tmp_loss
+
+                loss += rep_loss + att_loss
+                tr_att_loss += att_loss.item()
+                tr_rep_loss += rep_loss.item()
+
+            if n_gpu > 1:
+                loss = loss.mean()
+
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            global_step += 1
+
+            tr_loss += loss.item()
+            nb_tr_examples += label_ids.size(0)
+            nb_tr_steps += 1
+            if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1:
+                logger.info("***** Running evaluation *****")
+                logger.info("  {} step of {} steps".format(global_step, num_train_optimization_steps))
+                if previous_best is not None:
+                    logger.info(f"{fp32_performance}\nPrevious best = {previous_best}")
+
+                student_model.eval()
+
+                # The tr_* sums span all epochs while `step` restarts each epoch: average by global_step.
+                loss = tr_loss / global_step
+                cls_loss = tr_cls_loss / global_step
+                att_loss = tr_att_loss / global_step
+                rep_loss = tr_rep_loss / global_step
+
+                result = do_eval(student_model, task_name, eval_dataloader,
+                                 device, output_mode, eval_labels, num_labels)
+                result['global_step'] = global_step
+                result['cls_loss'] = cls_loss
+                result['att_loss'] = att_loss
+                result['rep_loss'] = rep_loss
+                result['loss'] = loss
+                summaryWriter.add_scalar('total_loss', loss, global_step)
+                summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss,
+                                                           'rep_loss': rep_loss,
+                                                           'cls_loss': cls_loss}, global_step)
+
+                summaryWriter.add_scalars('performance', {'acc': result['acc'],
+                                                          'f1': result['f1'],
+                                                          'acc_and_f1': result['acc_and_f1']}, global_step)
+
+                save_model = False
+
+                # Every MultiEmo task is scored by accuracy; task_name is never literally 'multiemo'.
+                if result['acc'] > best_dev_acc:
+                    previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
+                    best_dev_acc = result['acc']
+                    save_model = True
+
+                if save_model:
+                    logger.info(fp32_performance)
+                    logger.info(previous_best)
+                    if args.save_fp_model:
+                        logger.info("******************** Save full precision model ********************")
+                        model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
+                        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+                        output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+                        torch.save(model_to_save.state_dict(), output_model_file)
+                        model_to_save.config.to_json_file(output_config_file)
+                        tokenizer.save_vocabulary(output_dir)
+                    if args.save_quantized_model:
+                        logger.info("******************** Save quantized model ********************")
+                        output_quant_dir = os.path.join(output_dir, 'quant')
+                        os.makedirs(output_quant_dir, exist_ok=True)
+                        model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
+                        quant_model = copy.deepcopy(model_to_save)
+                        # Bake each module's quantizer output into the copied weights before saving.
+                        for name, module in quant_model.named_modules():
+                            if hasattr(module, 'weight_quantizer'):
+                                module.weight.data = module.weight_quantizer.apply(module.weight,
+                                                                                   module.weight_clip_val,
+                                                                                   module.weight_bits, True)
+
+                        output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
+                        output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)
+
+                        torch.save(quant_model.state_dict(), output_model_file)
+                        model_to_save.config.to_json_file(output_config_file)
+                        tokenizer.save_vocabulary(output_quant_dir)
+
+    summaryWriter.close()  # flush pending TensorBoard events
+
+
+if __name__ == "__main__":  # run the distillation only when executed as a script
+    main()
diff --git a/TernaryBERT/scripts/download_dataset.py b/TernaryBERT/scripts/download_dataset.py
new file mode 100644
index 00000000..de38d570
--- /dev/null
+++ b/TernaryBERT/scripts/download_dataset.py
@@ -0,0 +1,54 @@
+import os
+import zipfile
+
+import requests
+from tqdm.auto import tqdm
+
+# url = 'https://clarin-pl.eu/dspace/bitstream/handle/11321/798/multiemo.zip?sequence=2&isAllowed=y'
+url = 'https://clarin-pl.eu/dspace/handle/11321/798/allzip'
+
+
+def main(data_dir):
+    """Download the MultiEmo archive into ``data_dir`` and unpack it in place.
+
+    Raises ``requests.HTTPError`` if the server answers with a 4xx/5xx status.
+    """
+    output_zip = os.path.join(
+        data_dir,
+        'MultiEmo_ Multilingual, Multilevel, Multidomain Sentiment Analysis Corpus of Consumer Reviews.zip')
+    response = requests.get(url, stream=True)
+    # Fail here; before, a non-200 reply fell through to ZipFile() on a file never written.
+    response.raise_for_status()
+    total_size_in_bytes = int(response.headers.get('content-length', 0))
+    with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar, \
+            open(output_zip, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                progress_bar.update(len(chunk))
+                f.write(chunk)
+        received = progress_bar.n
+    if total_size_in_bytes != 0 and received != total_size_in_bytes:
+        print("ERROR, something went wrong")
+
+    # The outer archive is expected to hold multiemo.zip next to multiemo.7z.
+    with zipfile.ZipFile(output_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+    os.remove(output_zip)
+    os.remove(os.path.join(data_dir, 'multiemo.7z'))
+
+    data_output_zip = os.path.join(data_dir, 'multiemo.zip')
+    with zipfile.ZipFile(data_output_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+    os.remove(data_output_zip)
+    os.remove(os.path.join(data_dir, 'README.txt'))
+
+
+if __name__ == '__main__':
+    import argparse  # the module header never imports it, so the CLI raised NameError
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
+    args = parser.parse_args()
+
+    os.makedirs(args.data_dir, exist_ok=True)  # also creates missing parent directories
+
+    main(data_dir=args.data_dir)
diff --git a/TernaryBERT/utils_glue.py b/TernaryBERT/utils_glue.py
index 5a33219f..c19c4108 100644
--- a/TernaryBERT/utils_glue.py
+++ b/TernaryBERT/utils_glue.py
@@ -8,6 +8,7 @@
 
 logger = logging.getLogger()
 
+
 class InputExample(object):
     """A single training/test example for simple sequence classification."""
 
@@ -50,7 +51,7 @@ def get_train_examples(self, data_dir):
     def get_dev_examples(self, data_dir):
         """Gets a collection of `InputExample`s for the dev set."""
         raise NotImplementedError()
-        
+
     def get_test_examples(self, data_dir):
         """Gets a collection of `InputExample`s for the test set."""
         raise NotImplementedError()
@@ -84,7 +85,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -128,7 +129,7 @@ def get_dev_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
             "dev_matched")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
@@ -167,6 +168,7 @@ def get_dev_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
             "dev_matched")
+
     def get_test_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
@@ -186,7 +188,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -233,7 +235,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -276,7 +278,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -298,7 +300,7 @@ def _create_examples(self, lines, set_type):
             guid = "%s-%s" % (set_type, line[0])
             text_a = line[7]
             text_b = line[8]
-            if set_type== 'test':
+            if set_type == 'test':
                 label = None
             else:
                 label = line[-1]
@@ -319,7 +321,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -340,7 +342,7 @@ def _create_examples(self, lines, set_type):
                 continue
             guid = "%s-%s" % (set_type, line[0])
             try:
-                if set_type=='test':
+                if set_type == 'test':
                     text_a = line[1]
                     text_b = line[2]
                     label = None
@@ -368,7 +370,7 @@ def get_dev_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")),
             "dev_matched")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -388,7 +390,7 @@ def _create_examples(self, lines, set_type):
             if i == 0:
                 continue
             guid = "%s-%s" % (set_type, line[0])
-            if set_type=='test':
+            if set_type == 'test':
                 text_a = line[1]
                 text_b = line[2]
                 label = None
@@ -413,7 +415,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    
+
     def get_test_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
@@ -433,7 +435,7 @@ def _create_examples(self, lines, set_type):
             if i == 0:
                 continue
             guid = "%s-%s" % (set_type, line[0])
-            if set_type=='test':
+            if set_type == 'test':
                 text_a = line[1]
                 text_b = line[2]
                 label = None
diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
new file mode 100644
index 00000000..dcec6cb8
--- /dev/null
+++ b/TernaryBERT/utils_multiemo.py
@@ -0,0 +1,227 @@
+import os
+import logging
+from typing import List
+import sys
+import csv
+
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import matthews_corrcoef, f1_score
+
+logger = logging.getLogger()
+
+
+class InputExample(object):
+    """One raw (untokenized) example for sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Store the fields of a single example unchanged.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. Untokenized text of the first sequence; the only
+                text needed for single-sequence tasks.
+            text_b: (Optional) string. Untokenized text of the second sequence,
+                needed only for sequence-pair tasks.
+            label: (Optional) string. Label of the example; expected for train
+                and dev examples, may be omitted for test examples.
+        """
+        # No validation or copying happens here: the arguments are kept as given.
+        self.guid = guid
+        self.text_a, self.text_b = text_a, text_b
+        self.label = label
+
+
+class InputFeatures(object):
+    """Numeric model inputs built from one example."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None):
+        # Plain attribute container; every argument is stored exactly as passed in.
+        self.input_ids, self.input_mask = input_ids, input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+        self.seq_length = seq_length
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets.
+
+    Subclasses override the four `get_*` methods; each one here only raises
+    NotImplementedError.
+    """
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the test set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Read a UTF-8 tab-separated file and return its rows as lists of strings."""
+        with open(input_file, "r", encoding="utf-8") as f:
+            # The former Python 2 `unicode` re-decoding branch could never run: this
+            # module already relies on Python 3 syntax (f-strings, annotations).
+            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
+
+
+class MultiemoProcessor(DataProcessor):
+    """Processor for the MultiEmo data set, read from '<domain>.<kind>.<split>.<lang>.txt' files."""
+
+    def __init__(self, lang: str, domain: str, kind: str):
+        super(MultiemoProcessor, self).__init__()
+        self.lang = lang.lower()
+        self.domain = domain.lower()
+        self.kind = kind.lower()
+
+    def get_train_examples(self, data_dir: str) -> List[InputExample]:
+        """See base class."""
+        file_path = self.get_set_type_path(data_dir, 'train')
+        logger.info(f"LOOKING AT {file_path}")
+        return self._create_examples(self._read_txt(file_path), "train")
+
+    def get_dev_examples(self, data_dir: str) -> List[InputExample]:
+        """See base class."""
+        return self._create_examples(self._read_txt(self.get_set_type_path(data_dir, 'dev')), "dev")
+
+    def get_test_examples(self, data_dir: str) -> List[InputExample]:
+        """See base class."""
+        return self._create_examples(self._read_txt(self.get_set_type_path(data_dir, 'test')), "test")
+
+    def get_set_type_path(self, data_dir: str, set_type: str) -> str:
+        """Return the path of the `set_type` split file inside `data_dir`."""
+        return os.path.join(data_dir, '.'.join((self.domain, self.kind, set_type, self.lang, 'txt')))
+
+    def get_labels(self) -> List[str]:
+        """See base class."""
+        prefix = "meta" if self.kind == 'text' else "z"
+        return [prefix + suffix for suffix in ("_amb", "_minus_m", "_plus_m", "_zero")]
+
+    @staticmethod
+    def _read_txt(input_file: str) -> List[str]:
+        """Return the lines of the UTF-8 text file `input_file` without their line endings."""
+        with open(input_file, "r", encoding="utf-8") as f:
+            return f.read().splitlines()
+
+    @staticmethod
+    def _create_examples(lines: List[str], set_type: str) -> List[InputExample]:
+        """Build one example per '<text>__label__<label>' line; IndexError if a line lacks the marker."""
+        examples = []
+        for i, line in enumerate(lines):
+            fields = line.split('__label__')
+            examples.append(InputExample("%s-%s" % (set_type, i), fields[0], label=fields[1].strip()))
+        return examples
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer, output_mode):
+    """Convert `examples` into `InputFeatures` padded to `max_seq_length`.
+
+    Each example is encoded as "[CLS] a [SEP]", followed by "b [SEP]" when it has a
+    `text_b`, truncated to fit and then zero-padded.
+
+    Raises:
+        KeyError: `output_mode` is neither "classification" nor "regression".
+    """
+    # Checked up front: inside the old try block this error was swallowed by a
+    # bare `except`, so every example silently received label_id 0.
+    if output_mode not in ("classification", "regression"):
+        raise KeyError(output_mode)
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+        tokens_a = tokenizer.tokenize(example.text_a)
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+            # Leaves room for the three special tokens: [CLS] and two [SEP].
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Leaves room for the two special tokens: [CLS] and one [SEP].
+            tokens_a = tokens_a[:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
+        if tokens_b:
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_mask = [1] * len(input_ids)
+        seq_length = len(input_ids)
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+
+        try:
+            if output_mode == "classification":
+                label_id = label_map[example.label]
+            else:
+                label_id = float(example.label)
+        except (KeyError, TypeError, ValueError):
+            # Best effort kept from the original: a missing or unknown label maps to 0.
+            label_id = 0
+
+        if ex_index < 1:
+            logger.info("*** Example ***")
+            logger.info("guid: %s" % (example.guid))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("label: {}".format(example.label))
+            logger.info("label_id: {}".format(label_id))
+        features.append(InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids,
+                                      label_id=label_id, seq_length=seq_length))
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Pop trailing tokens, in place, until both lists together fit in `max_length`.
+
+    Each step shortens whichever list is currently longer; on a tie `tokens_b`
+    loses a token. Nothing is returned.
+    """
+    while len(tokens_a) + len(tokens_b) > max_length:
+        # Strict '>' means equal lengths fall through to tokens_b.
+        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        longer.pop()
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()  # mean of the equality mask; assumes array operands (e.g. numpy) - TODO confirm
+
+
+def acc_and_f1(preds, labels):
+    """Return accuracy, macro-F1 and their mean; macro because f1_score's 'binary' default rejects multiclass."""
+    acc, f1 = simple_accuracy(preds, labels), f1_score(y_true=labels, y_pred=preds, average='macro')
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+
+
+def compute_metrics(task_name, preds, labels):
+    assert len(preds) == len(labels)
+    if task_name == "multiemo":
+        return acc_and_f1(preds, labels)
+    else:
+        raise KeyError(task_name)

From 26328e0f832d64b724fe2315efdecd746e9865e6 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 19:08:50 +0100
Subject: [PATCH 22/62] feat: add evaluation on test set after training

---
 ...une_bert.py => multiemo_fine_tune_bert.py} | 106 +++++++++++------
 ..._task_polemo.py => quant_task_multiemo.py} | 107 +++++++++++++-----
 TernaryBERT/scripts/download_dataset.py       |   1 +
 TernaryBERT/utils.py                          |  39 +++++++
 TernaryBERT/utils_multiemo.py                 |   4 +-
 5 files changed, 191 insertions(+), 66 deletions(-)
 rename TernaryBERT/{fine_tune_bert.py => multiemo_fine_tune_bert.py} (83%)
 rename TernaryBERT/{quant_task_polemo.py => quant_task_multiemo.py} (83%)
 create mode 100644 TernaryBERT/utils.py

diff --git a/TernaryBERT/fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
similarity index 83%
rename from TernaryBERT/fine_tune_bert.py
rename to TernaryBERT/multiemo_fine_tune_bert.py
index e2c1d1ee..451c7091 100644
--- a/TernaryBERT/fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -29,11 +29,13 @@
 
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler)
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
 from tqdm import tqdm, trange
 
 from torch.nn import CrossEntropyLoss, MSELoss
+from sklearn.metrics import classification_report
 
+from utils import result_to_text_file
 from utils_multiemo import *
 from transformer.modeling import BertForSequenceClassification
 from transformer.tokenization import BertTokenizer
@@ -51,15 +53,6 @@
 logger = logging.getLogger()
 
 
-def result_to_file(result, file_name):
-    with open(file_name, "a") as writer:
-        writer.write("")
-        logger.info("***** Eval results *****")
-        for key in sorted(result.keys()):
-            logger.info("%s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-
 def get_tensor_data(output_mode, features):
     if output_mode == "classification":
         all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
@@ -78,13 +71,12 @@ def do_eval(model, task_name, eval_dataloader,
             device, output_mode, eval_labels, num_labels):
     eval_loss = 0
     nb_eval_steps = 0
-    preds = []
+    all_logits = None
 
-    for batch_ in tqdm(eval_dataloader, desc="Evaluating"):
+    for _, batch_ in enumerate(eval_dataloader):
         batch_ = tuple(t.to(device) for t in batch_)
         with torch.no_grad():
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
-
             logits, _, _ = model(input_ids, segment_ids, input_mask)
 
         # create eval loss and other metric required by the task
@@ -97,23 +89,19 @@ def do_eval(model, task_name, eval_dataloader,
 
         eval_loss += tmp_eval_loss.mean().item()
         nb_eval_steps += 1
-        if len(preds) == 0:
-            preds.append(logits.detach().cpu().numpy())
+
+        if all_logits is None:
+            all_logits = logits.detach().cpu().numpy()
         else:
-            preds[0] = np.append(
-                preds[0], logits.detach().cpu().numpy(), axis=0)
+            all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0)
 
     eval_loss = eval_loss / nb_eval_steps
 
-    preds = preds[0]
-    if output_mode == "classification":
-        preds = np.argmax(preds, axis=1)
-    elif output_mode == "regression":
-        preds = np.squeeze(preds)
-    result = compute_metrics(task_name, preds, eval_labels.numpy())
+    if output_mode == "regression":
+        all_logits = np.squeeze(all_logits)
+    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
     result['eval_loss'] = eval_loss
-
-    return result
+    return result, all_logits
 
 
 def main():
@@ -246,11 +234,17 @@ def main():
         if task_name in default_params:
             args.num_train_epoch = default_params[task_name]["num_train_epochs"]
 
-    if task_name not in processors:
+    if 'multiemo' in task_name:
+        _, lang, domain, kind = task_name.split('_')
+        processor = MultiemoProcessor(lang, domain, kind)
+    else:
+        raise ValueError("Task not found: %s" % task_name)
+
+    if 'multiemo' in task_name:
+        output_mode = 'classification'
+    else:
         raise ValueError("Task not found: %s" % task_name)
 
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
     label_list = processor.get_labels()
     num_labels = len(label_list)
 
@@ -284,15 +278,14 @@ def main():
 
     model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels)
     model.to(device)
-
     if args.do_eval:
         logger.info("***** Running evaluation *****")
         logger.info("  Num examples = %d", len(eval_examples))
         logger.info("  Batch size = %d", args.eval_batch_size)
 
         model.eval()
-        result = do_eval(model, task_name, eval_dataloader,
-                         device, output_mode, eval_labels, num_labels)
+        result, _ = do_eval(model, task_name, eval_dataloader,
+                            device, output_mode, eval_labels, num_labels)
         logger.info("***** Eval results *****")
         for key in sorted(result.keys()):
             logger.info("  %s = %s", key, str(result[key]))
@@ -385,13 +378,13 @@ def main():
                     loss = tr_loss / (step + 1)
                     cls_loss = tr_cls_loss / (step + 1)
 
-                    result = do_eval(model, task_name, eval_dataloader,
-                                     device, output_mode, eval_labels, num_labels)
+                    result, _ = do_eval(model, task_name, eval_dataloader,
+                                        device, output_mode, eval_labels, num_labels)
                     result['global_step'] = global_step
                     result['cls_loss'] = cls_loss
                     result['loss'] = loss
 
-                    result_to_file(result, output_eval_file)
+                    result_to_text_file(result, output_eval_file)
 
                     save_model = False
 
@@ -422,6 +415,51 @@ def main():
 
                     model.train()
 
+        # Measure End Time
+        training_end_time = time.monotonic()
+        diff_seconds = timedelta(seconds=training_end_time - training_start_time).total_seconds()
+
+        training_parameters = vars(args)
+        training_parameters['training_time'] = diff_seconds
+
+        # dictionary_to_json is not among the names this script imports from utils at module level.
+        import utils
+        output_training_params_file = os.path.join(args.output_dir, "training_params.json")
+        utils.dictionary_to_json(training_parameters, output_training_params_file)
+
+        #########################
+        #       Test model      #
+        #########################
+        test_examples = processor.get_test_examples(args.data_dir)
+        test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer,
+                                                     output_mode)
+
+        # The loader must wrap the test tensors; it previously reused eval_data/eval_sampler,
+        # so the "test" scores were computed on the dev set.
+        test_data, test_labels = get_tensor_data(output_mode, test_features)
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
+
+        logger.info("\n***** Running evaluation on test dataset *****")
+        logger.info("  Num examples = %d", len(test_features))
+        logger.info("  Batch size = %d", args.batch_size)
+
+        eval_start_time = time.monotonic()
+        result, y_logits = do_eval(model, task_name, test_dataloader,
+                                   device, output_mode, test_labels, num_labels)
+        eval_end_time = time.monotonic()
+        diff_seconds = timedelta(seconds=eval_end_time - eval_start_time).total_seconds()
+        result['eval_time'] = diff_seconds
+        result_to_text_file(result, os.path.join(args.output_dir, "test_results.txt"))
+
+        y_true, y_pred = test_labels.numpy(), np.argmax(y_logits, axis=1)  # gold labels are the test label tensor
+        print('\n\t**** Classification report ****\n')
+        print(classification_report(y_true, y_pred, target_names=label_list))
+
+        report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True)
+        report['eval_time'] = diff_seconds
+        utils.dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json"))
+
 
 if __name__ == "__main__":
     main()
diff --git a/TernaryBERT/quant_task_polemo.py b/TernaryBERT/quant_task_multiemo.py
similarity index 83%
rename from TernaryBERT/quant_task_polemo.py
rename to TernaryBERT/quant_task_multiemo.py
index ca07bfe3..0561cda1 100644
--- a/TernaryBERT/quant_task_polemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -7,12 +7,15 @@
 import sys
 import pickle
 import copy
+import time
+from datetime import timedelta
 
 import numpy as np
 import torch
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.tensorboard import SummaryWriter
 from torch.nn import CrossEntropyLoss, MSELoss
+from sklearn.metrics import classification_report
+from tqdm import trange
 
 from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME
 from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
@@ -20,6 +23,7 @@
 from transformer import BertAdam
 from transformer import BertConfig
 from utils_multiemo import *
+from utils import dictionary_to_json, result_to_text_file
 
 log_format = '%(asctime)s %(message)s'
 logging.basicConfig(stream=sys.stdout, level=logging.INFO,
@@ -45,7 +49,7 @@ def do_eval(model, task_name, eval_dataloader,
             device, output_mode, eval_labels, num_labels):
     eval_loss = 0
     nb_eval_steps = 0
-    preds = []
+    all_logits = None
 
     for _, batch_ in enumerate(eval_dataloader):
         batch_ = tuple(t.to(device) for t in batch_)
@@ -63,22 +67,19 @@ def do_eval(model, task_name, eval_dataloader,
 
         eval_loss += tmp_eval_loss.mean().item()
         nb_eval_steps += 1
-        if len(preds) == 0:
-            preds.append(logits.detach().cpu().numpy())
+
+        if all_logits is None:
+            all_logits = logits.detach().cpu().numpy()
         else:
-            preds[0] = np.append(
-                preds[0], logits.detach().cpu().numpy(), axis=0)
+            all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0)
 
     eval_loss = eval_loss / nb_eval_steps
 
-    preds = preds[0]
-    if output_mode == "classification":
-        preds = np.argmax(preds, axis=1)
-    elif output_mode == "regression":
-        preds = np.squeeze(preds)
-    result = compute_metrics(task_name, preds, eval_labels.numpy())
+    if output_mode == "regression":
+        all_logits = np.squeeze(all_logits)
+    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
     result['eval_loss'] = eval_loss
-    return result
+    return result, all_logits
 
 
 def soft_cross_entropy(predicts, targets):
@@ -106,7 +107,6 @@ def main():
                         type=str,
                         help="The models directory.")
     parser.add_argument("--task_name",
-                        default='sst-2',
                         type=str,
                         help="The name of the task to train.")
     parser.add_argument("--output_dir",
@@ -159,10 +159,9 @@ def main():
 
     args = parser.parse_args()
     assert args.pred_distill or args.intermediate_distill, "'pred_distill' and 'intermediate_distill', at least one must be True"
-    summaryWriter = SummaryWriter(args.output_dir)
     logger.info('The args: {}'.format(args))
     task_name = args.task_name.lower()
-    data_dir = os.path.join(args.data_dir, task_name)
+    data_dir = os.path.join(args.data_dir)
     output_dir = os.path.join(args.output_dir, task_name)
     # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name)
 
@@ -250,8 +249,9 @@ def main():
     if n_gpu > 1:
         teacher_model = torch.nn.DataParallel(teacher_model)
 
-    result = do_eval(teacher_model, task_name, eval_dataloader,
-                     device, output_mode, eval_labels, num_labels)
+    result, _ = do_eval(teacher_model, task_name, eval_dataloader,
+                        device, output_mode, eval_labels, num_labels)
+
     fp32_performance = f"f1/acc:{result['f1']}/{result['acc']}"
     fp32_performance = task_name + ' fp32   ' + fp32_performance
 
@@ -266,6 +266,8 @@ def main():
                                                                        num_labels=num_labels)
     student_model.to(device)
 
+    training_start_time = time.monotonic()
+
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_features))
     logger.info("  Batch size = %d", args.batch_size)
@@ -290,12 +292,13 @@ def main():
     global_step = 0
     best_dev_acc = 0.0
     previous_best = None
+    output_eval_file = os.path.join(output_dir, "eval_results.txt")
 
     tr_loss = 0.
     tr_att_loss = 0.
     tr_rep_loss = 0.
     tr_cls_loss = 0.
-    for epoch_ in range(int(args.num_train_epochs)):
+    for epoch_ in trange(int(args.num_train_epochs)):
         nb_tr_examples, nb_tr_steps = 0, 0
 
         for step, batch in enumerate(train_dataloader):
@@ -362,21 +365,16 @@ def main():
                 att_loss = tr_att_loss / (step + 1)
                 rep_loss = tr_rep_loss / (step + 1)
 
-                result = do_eval(student_model, task_name, eval_dataloader,
-                                 device, output_mode, eval_labels, num_labels)
+                result, _ = do_eval(student_model, task_name, eval_dataloader,
+                                    device, output_mode, eval_labels, num_labels)
+
                 result['global_step'] = global_step
                 result['cls_loss'] = cls_loss
                 result['att_loss'] = att_loss
                 result['rep_loss'] = rep_loss
                 result['loss'] = loss
-                summaryWriter.add_scalar('total_loss', loss, global_step)
-                summaryWriter.add_scalars('distill_loss', {'att_loss': att_loss,
-                                                           'rep_loss': rep_loss,
-                                                           'cls_loss': cls_loss}, global_step)
 
-                summaryWriter.add_scalars('performance', {'acc': result['acc'],
-                                                          'f1': result['f1'],
-                                                          'acc_and_f1': result['acc_and_f1']}, global_step)
+                result_to_text_file(result, output_eval_file)
 
                 save_model = False
 
@@ -406,9 +404,11 @@ def main():
                         quant_model = copy.deepcopy(model_to_save)
                         for name, module in quant_model.named_modules():
                             if hasattr(module, 'weight_quantizer'):
-                                module.weight.data = module.weight_quantizer.apply(module.weight,
-                                                                                   module.weight_clip_val,
-                                                                                   module.weight_bits, True)
+                                module.weight.data = module.weight_quantizer.apply(
+                                    module.weight,
+                                    module.weight_clip_val,
+                                    module.weight_bits, True
+                                )
 
                         output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
                         output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)
@@ -417,6 +417,51 @@ def main():
                         model_to_save.config.to_json_file(output_config_file)
                         tokenizer.save_vocabulary(output_quant_dir)
 
+    # Measure End Time
+    training_end_time = time.monotonic()
+
+    diff_seconds = timedelta(seconds=training_end_time - training_start_time).total_seconds()
+
+    training_parameters = vars(args)
+    training_parameters['training_time'] = diff_seconds
+
+    output_training_params_file = os.path.join(output_dir, "training_params.json")
+    dictionary_to_json(training_parameters, output_training_params_file)
+
+    #########################
+    #       Test model      #
+    #########################
+    test_examples = processor.get_test_examples(data_dir)
+    test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer,
+                                                 output_mode)
+
+    # The loader must wrap the test tensors; it previously reused eval_data/eval_sampler,
+    # so the "test" scores were computed on the dev set.
+    test_data, test_labels = get_tensor_data(output_mode, test_features)
+    test_sampler = SequentialSampler(test_data)
+    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
+
+    logger.info("\n***** Running evaluation on test dataset *****")
+    logger.info("  Num examples = %d", len(test_features))
+    logger.info("  Batch size = %d", args.batch_size)
+
+    eval_start_time = time.monotonic()
+    result, y_logits = do_eval(student_model, task_name, test_dataloader,
+                               device, output_mode, test_labels, num_labels)
+    eval_end_time = time.monotonic()
+
+    diff_seconds = timedelta(seconds=eval_end_time - eval_start_time).total_seconds()
+    result['eval_time'] = diff_seconds
+    result_to_text_file(result, os.path.join(output_dir, "test_results.txt"))
+
+    y_pred = np.argmax(y_logits, axis=1)
+    print('\n\t**** Classification report ****\n')
+    print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
+
+    report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
+    report['eval_time'] = diff_seconds
+    dictionary_to_json(report, os.path.join(output_dir, "test_results.json"))
+
 
 if __name__ == "__main__":
     main()
diff --git a/TernaryBERT/scripts/download_dataset.py b/TernaryBERT/scripts/download_dataset.py
index de38d570..08bdc6fb 100644
--- a/TernaryBERT/scripts/download_dataset.py
+++ b/TernaryBERT/scripts/download_dataset.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import zipfile
 
diff --git a/TernaryBERT/utils.py b/TernaryBERT/utils.py
new file mode 100644
index 00000000..307ea9b2
--- /dev/null
+++ b/TernaryBERT/utils.py
@@ -0,0 +1,39 @@
+import json
+import logging
+import os
+import sys
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%d/%m/%Y %H:%M:%S')
+logger = logging.getLogger(__name__)
+
+
+def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None:
+    with open(file_name, "a") as writer:
+        if verbose:
+            logger.info("***** Eval results *****")
+
+        for key in sorted(result.keys()):
+            if verbose:
+                logger.info(" %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+        writer.write("")
+
+
+def dictionary_to_json(dictionary: dict, file_name: str):
+    with open(file_name, "w") as f:
+        json.dump(dictionary, f, indent=2)
+
+
+def is_folder_empty(folder_name: str):
+    if len([f for f in os.listdir(folder_name) if not f.startswith('.')]) == 0:
+        return True
+    else:
+        return False
+
+
+def get_immediate_subdirectories(directory: str):
+    return [os.path.join(directory, name) for name in os.listdir(directory)
+            if os.path.isdir(os.path.join(directory, name))]
\ No newline at end of file
diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
index dcec6cb8..b20e8f61 100644
--- a/TernaryBERT/utils_multiemo.py
+++ b/TernaryBERT/utils_multiemo.py
@@ -3,6 +3,7 @@
 import sys
 import csv
 
+import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
 
@@ -218,7 +219,8 @@ def acc_and_f1(preds, labels):
     }
 
 
-def compute_metrics(task_name, preds, labels):
+def compute_metrics(task_name, logits, labels):
+    preds = np.argmax(logits, axis=1)
     assert len(preds) == len(labels)
     if task_name == "multiemo":
         return acc_and_f1(preds, labels)

From e549623f988ceaf9c2cf59b514ce44bd369b1abb Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 19:10:15 +0100
Subject: [PATCH 23/62] feat: add Dockerfile

---
 TernaryBERT/Dockerfile       | 25 +++++++++++++++++++++++++
 TernaryBERT/requirements.txt |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 TernaryBERT/Dockerfile

diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile
new file mode 100644
index 00000000..81616e69
--- /dev/null
+++ b/TernaryBERT/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04
+
+ENV TZ=Europe/Minsk
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+RUN apt update && \
+    apt install --no-install-recommends -y build-essential software-properties-common && \
+    apt install --no-install-recommends -y python3.8.12 python3-pip python3-dev python3-setuptools python3-distutils && \
+    apt clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+RUN python3.8 -m pip install --upgrade pip && \
+    python3.8 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \
+    -f https://download.pytorch.org/whl/torch_stable.html
+
+COPY ./requirements.txt .
+RUN python3.8 -m pip install --no-cache-dir -r requirements.txt
+RUN rm requirements.txt
+
+ARG USER_ID
+ARG GROUP_ID
+
+RUN addgroup --gid $GROUP_ID user
+RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user
+USER user
diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt
index 7bbcdf0a..f86b0709 100644
--- a/TernaryBERT/requirements.txt
+++ b/TernaryBERT/requirements.txt
@@ -4,4 +4,4 @@ scipy
 future
 Pillow
 tensorflow==1.14.0
-torch==1.1.0
+# torch==1.1.0

From d94594abc746a1b1ecc5b5610075f5fd05372cc5 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 19:47:57 +0100
Subject: [PATCH 24/62] feat: add experiments script

---
 TernaryBERT/multiemo_fine_tune_bert.py  | 15 +++--
 TernaryBERT/quant_task_multiemo.py      | 30 ++++++---
 TernaryBERT/run_experiments.py          | 85 +++++++++++++++++++++++++
 TernaryBERT/scripts/download_dataset.py |  2 +-
 TernaryBERT/utils_multiemo.py           |  1 +
 5 files changed, 118 insertions(+), 15 deletions(-)
 create mode 100644 TernaryBERT/run_experiments.py

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index 451c7091..a41b4d69 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -26,6 +26,8 @@
 import os
 import random
 import sys
+import time
+from datetime import timedelta
 
 import numpy as np
 import torch
@@ -35,7 +37,7 @@
 from torch.nn import CrossEntropyLoss, MSELoss
 from sklearn.metrics import classification_report
 
-from utils import result_to_text_file
+from utils import result_to_text_file, dictionary_to_json
 from utils_multiemo import *
 from transformer.modeling import BertForSequenceClassification
 from transformer.tokenization import BertTokenizer
@@ -150,7 +152,7 @@ def main():
                         type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument('--weight_decay', '--wd',
-                        default=1e-4,
+                        default=0.01,
                         type=float,
                         metavar='W',
                         help='weight decay')
@@ -290,6 +292,8 @@ def main():
         for key in sorted(result.keys()):
             logger.info("  %s = %s", key, str(result[key]))
     else:
+        training_start_time = time.monotonic()
+
         logger.info("***** Running training *****")
         logger.info("  Num examples = %d", len(train_examples))
         logger.info("  Batch size = %d", args.train_batch_size)
@@ -306,7 +310,8 @@ def main():
         logger.info('Total parameters: {}'.format(size))
         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
         optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+             'weight_decay': args.weight_decay},
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
         schedule = 'warmup_linear'
@@ -454,9 +459,9 @@ def main():
 
         y_pred = np.argmax(y_logits, axis=1)
         print('\n\t**** Classification report ****\n')
-        print(classification_report(y_true, y_pred, target_names=label_list))
+        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
 
-        report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True)
+        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
         report['eval_time'] = diff_seconds
         dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json"))
 
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 0561cda1..8cc79122 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -114,6 +114,9 @@ def main():
                         type=str,
                         help="The output directory where the model predictions and checkpoints will be written.")
 
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
     parser.add_argument("--learning_rate",
                         default=2e-5,
                         type=float,
@@ -122,6 +125,12 @@ def main():
                         default=3.0,
                         type=float,
                         help="Total number of training epochs to perform.")
+    parser.add_argument('--weight_decay', '--wd',
+                        default=0.01,
+                        type=float,
+                        metavar='W',
+                        help='weight decay')
+
     parser.add_argument('--seed',
                         type=int,
                         default=42,
@@ -161,9 +170,8 @@ def main():
     assert args.pred_distill or args.intermediate_distill, "'pred_distill' and 'intermediate_distill', at least one must be True"
     logger.info('The args: {}'.format(args))
     task_name = args.task_name.lower()
-    data_dir = os.path.join(args.data_dir)
+    data_dir = args.data_dir
     output_dir = os.path.join(args.output_dir, task_name)
-    # processed_data_dir = os.path.join(args.data_dir,'preprocessed',task_name)
 
     if not os.path.exists(output_dir):
         os.mkdir(output_dir)
@@ -219,7 +227,7 @@ def main():
     label_list = processor.get_labels()
     num_labels = len(label_list)
 
-    tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=True)
+    tokenizer = BertTokenizer.from_pretrained(args.student_model, do_lower_case=args.do_lower_case)
 
     if args.aug_train:
         train_examples = processor.get_aug_examples(data_dir)
@@ -278,16 +286,20 @@ def main():
     # Prepare optimizer
     param_optimizer = list(student_model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     schedule = 'warmup_linear'
-    optimizer = BertAdam(optimizer_grouped_parameters,
-                         schedule=schedule,
-                         lr=args.learning_rate,
-                         warmup=0.1,
-                         t_total=num_train_optimization_steps)
+    optimizer = BertAdam(
+        optimizer_grouped_parameters,
+        schedule=schedule,
+        lr=args.learning_rate,
+        warmup=0.1,
+        t_total=num_train_optimization_steps
+    )
     loss_mse = MSELoss()
     global_step = 0
     best_dev_acc = 0.0
diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py
new file mode 100644
index 00000000..d4e0d94a
--- /dev/null
+++ b/TernaryBERT/run_experiments.py
@@ -0,0 +1,85 @@
+import logging
+import os
+import sys
+
+PROJECT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%d/%m/%Y %H:%M:%S')
+logger = logging.getLogger(__name__)
+
+data_dir = os.path.join('data', 'multiemo2')
+
+num_train_epochs = 3
+learning_rate = 5e-5
+weight_decay = 0.01
+
+
+def main():
+    os.chdir(PROJECT_FOLDER)
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):
+        logger.info("Downloading Multiemo data")
+        cmd = 'python3 -m scripts.download_dataset --data_dir data/multiemo2'
+        run_process(cmd)
+        logger.info("Downloading finished")
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
+        logger.info("Downloading bert-base-uncased model")
+        cmd = 'python3 -m download_bert_base'
+        run_process(cmd)
+        logger.info("Downloading finished")
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
+        logger.info("Downloading bert-base-uncased model")
+        cmd = 'python3 -m download_bert_base'
+        run_process(cmd)
+        logger.info("Downloading finished")
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')):
+        cmd = 'python3 -m multiemo_fine_tune_bert '
+        options = [
+            '--pretrained_model', 'data/models/bert-base-uncased',
+            '--data_dir', 'data/multiemo2',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence',
+            '--do_lower_case'
+        ]
+        cmd += ' '.join(options)
+        logger.info(f"Training bert-base-uncased for multiemo_en_all_sentence")
+        run_process(cmd)
+
+    cmd = 'python3 -m quant_task_multiemo '
+    options = [
+        '--data_dir', 'data/multiemo2',
+        '--model_dir ', 'data/models/bert-base-uncased',
+        '--task_name', 'multiemo_en_all_sentence',
+        '--output_dir', 'data/models/ternarybert',
+        '--learning_rate', str(learning_rate),
+        '--num_train_epochs', str(num_train_epochs),
+        '--weight_decay', str(weight_decay),
+        '--weight_bits', str(2),
+        '--input_bits', str(8),
+        '--pred_distill',
+        '--intermediate_distill',
+        '--save_fp_model',
+        '--save_quantized_model',
+        '--do_lower_case'
+    ]
+    cmd += ' '.join(options)
+    logger.info(f"Training ternarybert for multiemo_en_all_sentence")
+    run_process(cmd)
+
+    # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence'
+    # logger.info(f"Gathering results to csv for multiemo_en_all_sentence")
+    # run_process(cmd)
+
+
+def run_process(proc):
+    os.system(proc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/TernaryBERT/scripts/download_dataset.py b/TernaryBERT/scripts/download_dataset.py
index 08bdc6fb..701ffd3a 100644
--- a/TernaryBERT/scripts/download_dataset.py
+++ b/TernaryBERT/scripts/download_dataset.py
@@ -46,7 +46,7 @@ def main(data_dir):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
+    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='multiemo2')
     args = parser.parse_args()
 
     if not os.path.isdir(args.data_dir):
diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
index b20e8f61..73241068 100644
--- a/TernaryBERT/utils_multiemo.py
+++ b/TernaryBERT/utils_multiemo.py
@@ -2,6 +2,7 @@
 import logging
 import sys
 import csv
+from typing import List
 
 import numpy as np
 from scipy.stats import pearsonr, spearmanr

From e5e731c66322776d06f6b38027a4cf7f25044fed Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 20:16:54 +0100
Subject: [PATCH 25/62] feat: fix Dockerfile

---
 TernaryBERT/Dockerfile       | 8 ++++----
 TernaryBERT/requirements.txt | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile
index 81616e69..9e8f0495 100644
--- a/TernaryBERT/Dockerfile
+++ b/TernaryBERT/Dockerfile
@@ -4,17 +4,17 @@ ENV TZ=Europe/Minsk
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 RUN apt update && \
     apt install --no-install-recommends -y build-essential software-properties-common && \
-    apt install --no-install-recommends -y python3.8.12 python3-pip python3-dev python3-setuptools python3-distutils && \
+    apt install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \
     apt clean && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-RUN python3.8 -m pip install --upgrade pip && \
-    python3.8 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 \
     -f https://download.pytorch.org/whl/torch_stable.html
 
 COPY ./requirements.txt .
-RUN python3.8 -m pip install --no-cache-dir -r requirements.txt
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
 RUN rm requirements.txt
 
 ARG USER_ID
diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt
index f86b0709..759eb3ae 100644
--- a/TernaryBERT/requirements.txt
+++ b/TernaryBERT/requirements.txt
@@ -3,5 +3,8 @@ requests
 scipy
 future
 Pillow
-tensorflow==1.14.0
+tensorflow~=1.14.0
+numpy~=1.21.2
+pandas~=1.3.3
+scikit-learn~=1.0
 # torch==1.1.0

From d63093a87f99180752725ee8a7857248f63c29ee Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 20:33:09 +0100
Subject: [PATCH 26/62] fixup! feat: fix Dockerfile

---
 TernaryBERT/Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/TernaryBERT/Dockerfile b/TernaryBERT/Dockerfile
index 9e8f0495..68c2c107 100644
--- a/TernaryBERT/Dockerfile
+++ b/TernaryBERT/Dockerfile
@@ -2,10 +2,10 @@ FROM nvidia/cuda:11.2.0-cudnn8-runtime-ubuntu20.04
 
 ENV TZ=Europe/Minsk
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN apt update && \
-    apt install --no-install-recommends -y build-essential software-properties-common && \
-    apt install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \
-    apt clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y build-essential software-properties-common && \
+    apt-get install --no-install-recommends -y python3.6 python3-pip python3-dev python3-setuptools python3-distutils && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 

From ace5d6b0c3e36a48203250788c70182aaac60a06 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 20:47:13 +0100
Subject: [PATCH 27/62] fixup! fixup! feat: fix Dockerfile

---
 TernaryBERT/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt
index 759eb3ae..71ce02ad 100644
--- a/TernaryBERT/requirements.txt
+++ b/TernaryBERT/requirements.txt
@@ -3,7 +3,7 @@ requests
 scipy
 future
 Pillow
-tensorflow~=1.14.0
+# tensorflow~=1.14.0
 numpy~=1.21.2
 pandas~=1.3.3
 scikit-learn~=1.0

From a6c511117560958441527c86a9ca7936d2973de7 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 20:56:03 +0100
Subject: [PATCH 28/62] feat: fix commands in runneing experiment script

---
 TernaryBERT/run_experiments.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py
index d4e0d94a..a2e28acc 100644
--- a/TernaryBERT/run_experiments.py
+++ b/TernaryBERT/run_experiments.py
@@ -22,24 +22,18 @@ def main():
 
     if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):
         logger.info("Downloading Multiemo data")
-        cmd = 'python3 -m scripts.download_dataset --data_dir data/multiemo2'
+        cmd = 'python3 scripts/download_dataset.py --data_dir data/multiemo2'
         run_process(cmd)
         logger.info("Downloading finished")
 
     if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
         logger.info("Downloading bert-base-uncased model")
-        cmd = 'python3 -m download_bert_base'
-        run_process(cmd)
-        logger.info("Downloading finished")
-
-    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
-        logger.info("Downloading bert-base-uncased model")
-        cmd = 'python3 -m download_bert_base'
+        cmd = 'python3 download_bert_base.py'
         run_process(cmd)
         logger.info("Downloading finished")
 
     if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')):
-        cmd = 'python3 -m multiemo_fine_tune_bert '
+        cmd = 'python3 multiemo_fine_tune_bert.py '
         options = [
             '--pretrained_model', 'data/models/bert-base-uncased',
             '--data_dir', 'data/multiemo2',
@@ -51,7 +45,7 @@ def main():
         logger.info(f"Training bert-base-uncased for multiemo_en_all_sentence")
         run_process(cmd)
 
-    cmd = 'python3 -m quant_task_multiemo '
+    cmd = 'python3 quant_task_multiemo.py '
     options = [
         '--data_dir', 'data/multiemo2',
         '--model_dir ', 'data/models/bert-base-uncased',

From add490329e4083d2fec2111fd364e2dfad812811 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:01:19 +0100
Subject: [PATCH 29/62] fixup! feat: fix commands in runneing experiment script

---
 TernaryBERT/run_experiments.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py
index a2e28acc..c62ef670 100644
--- a/TernaryBERT/run_experiments.py
+++ b/TernaryBERT/run_experiments.py
@@ -2,7 +2,7 @@
 import os
 import sys
 
-PROJECT_FOLDER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
 DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
 
 log_format = '%(asctime)s %(message)s'
@@ -18,6 +18,7 @@
 
 
 def main():
+    print(PROJECT_FOLDER)
     os.chdir(PROJECT_FOLDER)
 
     if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):

From 6265a5f707211814c02c97416a6a1cf3d7420573 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:03:00 +0100
Subject: [PATCH 30/62] fixup! fixup! feat: fix commands in runneing experiment
 script

---
 TernaryBERT/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt
index 71ce02ad..7dc69627 100644
--- a/TernaryBERT/requirements.txt
+++ b/TernaryBERT/requirements.txt
@@ -7,4 +7,5 @@ Pillow
 numpy~=1.21.2
 pandas~=1.3.3
 scikit-learn~=1.0
+tqdm
 # torch==1.1.0

From 05e145e3dc5494960be81e033e6b369799bd03a8 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:17:56 +0100
Subject: [PATCH 31/62] feat: fix loading data

---
 TernaryBERT/quant_task_multiemo.py | 5 -----
 TernaryBERT/utils_multiemo.py      | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 8cc79122..1d4e959d 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -1,16 +1,11 @@
 from __future__ import absolute_import, division, print_function
 
 import argparse
-import logging
-import os
 import random
-import sys
-import pickle
 import copy
 import time
 from datetime import timedelta
 
-import numpy as np
 import torch
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.nn import CrossEntropyLoss, MSELoss
diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
index 73241068..56238d50 100644
--- a/TernaryBERT/utils_multiemo.py
+++ b/TernaryBERT/utils_multiemo.py
@@ -74,6 +74,13 @@ def _read_tsv(cls, input_file, quotechar=None):
                 lines.append(line)
             return lines
 
+    @classmethod
+    def _read_txt(cls, input_file: str) -> List[str]:
+        """Reads a UTF-8 text file and returns its lines without line endings."""
+        with open(input_file, "r", encoding='UTF-8') as f:
+            lines = f.read().splitlines()
+        return lines
+
 
 class MultiemoProcessor(DataProcessor):
     """Processor for the Multiemo data2 set"""

From 406cacbdaf7a7ba2ce2da6b3f9e1ff025c1ab42f Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:20:31 +0100
Subject: [PATCH 32/62] feat: handle no directory error

---
 TernaryBERT/multiemo_fine_tune_bert.py | 4 ++--
 TernaryBERT/quant_task_multiemo.py     | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index a41b4d69..66fa9923 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -224,8 +224,8 @@ def main():
     # Prepare task settings
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
+
+    os.makedirs(args.output_dir, exist_ok=True)
 
     task_name = args.task_name.lower()
 
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 1d4e959d..82156815 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -168,8 +168,7 @@ def main():
     data_dir = args.data_dir
     output_dir = os.path.join(args.output_dir, task_name)
 
-    if not os.path.exists(output_dir):
-        os.mkdir(output_dir)
+    os.makedirs(output_dir, exist_ok=True)
 
     if args.student_model is None:
         args.student_model = os.path.join(args.model_dir, task_name)

From 58f9d0514f665468d491318e2c21a90d502bb9a1 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:26:32 +0100
Subject: [PATCH 33/62] feat: correct metrics counting

---
 TernaryBERT/utils_multiemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
index 56238d50..bc99d1c9 100644
--- a/TernaryBERT/utils_multiemo.py
+++ b/TernaryBERT/utils_multiemo.py
@@ -230,7 +230,7 @@ def acc_and_f1(preds, labels):
 def compute_metrics(task_name, logits, labels):
     preds = np.argmax(logits, axis=1)
     assert len(preds) == len(labels)
-    if task_name == "multiemo":
+    if 'multiemo' in task_name:
         return acc_and_f1(preds, labels)
     else:
         raise KeyError(task_name)

From 5cddb66f55fb09c039691057cf8c6cb71dd0beee Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sat, 27 Nov 2021 21:37:32 +0100
Subject: [PATCH 34/62] feat: correct f1 score calculation and batch size issue

---
 TernaryBERT/multiemo_fine_tune_bert.py | 9 ++-------
 TernaryBERT/run_experiments.py         | 5 +++++
 TernaryBERT/utils_multiemo.py          | 8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index 66fa9923..ef867829 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -21,15 +21,10 @@
 from __future__ import absolute_import, division, print_function
 
 import argparse
-import csv
-import logging
-import os
 import random
-import sys
 import time
 from datetime import timedelta
 
-import numpy as np
 import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
 from tqdm import tqdm, trange
@@ -140,11 +135,11 @@ def main():
                         action='store_true',
                         help="Set this flag if you are using an uncased model.")
     parser.add_argument("--train_batch_size",
-                        default=32,
+                        default=16,
                         type=int,
                         help="Total batch size for training.")
     parser.add_argument("--eval_batch_size",
-                        default=32,
+                        default=16,
                         type=int,
                         help="Total batch size for eval.")
     parser.add_argument("--learning_rate",
diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py
index c62ef670..0c0882f4 100644
--- a/TernaryBERT/run_experiments.py
+++ b/TernaryBERT/run_experiments.py
@@ -12,6 +12,7 @@
 
 data_dir = os.path.join('data', 'multiemo2')
 
+batch_size = 16
 num_train_epochs = 3
 learning_rate = 5e-5
 weight_decay = 0.01
@@ -40,6 +41,10 @@ def main():
             '--data_dir', 'data/multiemo2',
             '--task_name', 'multiemo_en_all_sentence',
             '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence',
+            '--learning_rate', str(learning_rate),
+            '--num_train_epochs', str(num_train_epochs),
+            '--weight_decay', str(weight_decay),
+            '--train_batch_size', str(batch_size),
             '--do_lower_case'
         ]
         cmd += ' '.join(options)
diff --git a/TernaryBERT/utils_multiemo.py b/TernaryBERT/utils_multiemo.py
index bc99d1c9..e7ba2a24 100644
--- a/TernaryBERT/utils_multiemo.py
+++ b/TernaryBERT/utils_multiemo.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import matthews_corrcoef, f1_score
+from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score
 
 logger = logging.getLogger()
 
@@ -214,12 +214,12 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
 
 
 def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
+    return accuracy_score(y_true=labels, y_pred=preds)
 
 
 def acc_and_f1(preds, labels):
-    acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
+    acc = accuracy_score(y_true=labels, y_pred=preds)
+    f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
     return {
         "acc": acc,
         "f1": f1,

From dc94fc237157fb6b379fd33a71b2c5edea4d4cff Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 09:31:05 +0100
Subject: [PATCH 35/62] feat: refactor training loops

---
 TernaryBERT/multiemo_fine_tune_bert.py | 161 +++++++++-------------
 TernaryBERT/quant_task_multiemo.py     | 177 +++++++++++++------------
 2 files changed, 156 insertions(+), 182 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index ef867829..23439d93 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -167,10 +167,6 @@ def main():
                         type=int,
                         default=42,
                         help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
 
     # added arguments
     parser.add_argument('--aug_train',
@@ -185,19 +181,8 @@ def main():
     # intermediate distillation default parameters
     default_params = {
         "multiemo": {"num_train_epochs": 3, "max_seq_length": 128},
-        "cola": {"num_train_epochs": 3, "max_seq_length": 64},
-        "mnli": {"num_train_epochs": 3, "max_seq_length": 128},
-        "mrpc": {"num_train_epochs": 3, "max_seq_length": 128},
-        "sst-2": {"num_train_epochs": 3, "max_seq_length": 64},
-        "sts-b": {"num_train_epochs": 3, "max_seq_length": 128},
-        "qqp": {"num_train_epochs": 3, "max_seq_length": 128},
-        "qnli": {"num_train_epochs": 3, "max_seq_length": 128},
-        "rte": {"num_train_epochs": 5, "max_seq_length": 128}
     }
-
-    acc_tasks = ["multiemo", "mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
-    corr_tasks = ["sts-b"]
-    mcc_tasks = ["cola"]
+    acc_tasks = ["multiemo"]
 
     # Prepare devices
     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
@@ -206,7 +191,6 @@ def main():
     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                         datefmt='%m/%d/%Y %H:%M:%S',
                         level=logging.INFO)
-
     logger.info("device: {} n_gpu: {}".format(device, n_gpu))
 
     # Prepare seed
@@ -223,7 +207,6 @@ def main():
     os.makedirs(args.output_dir, exist_ok=True)
 
     task_name = args.task_name.lower()
-
     if task_name in default_params:
         args.max_seq_len = default_params[task_name]["max_seq_length"]
 
@@ -252,14 +235,8 @@ def main():
             train_examples = processor.get_train_examples(args.data_dir)
         else:
             train_examples = processor.get_aug_examples(args.data_dir)
-        if args.gradient_accumulation_steps < 1:
-            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                args.gradient_accumulation_steps))
-
-        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
 
-        num_train_optimization_steps = int(
-            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+        num_train_optimization_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs
 
         train_features = convert_examples_to_features(train_examples, label_list,
                                                       args.max_seq_length, tokenizer, output_mode)
@@ -295,49 +272,28 @@ def main():
         logger.info("  Num steps = %d", num_train_optimization_steps)
         if n_gpu > 1:
             model = torch.nn.DataParallel(model)
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-        size = 0
-        for n, p in model.named_parameters():
-            logger.info('n: {}'.format(n))
-            size += p.nelement()
-
-        logger.info('Total parameters: {}'.format(size))
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
-             'weight_decay': args.weight_decay},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-        schedule = 'warmup_linear'
-
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             schedule=schedule,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+
+        optimizer = get_optimizer(args, model, num_train_optimization_steps)
 
         # Train and evaluate
         global_step = 0
         best_dev_acc = 0.0
         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
 
-        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
+        for epoch_ in range(int(args.num_train_epochs)):
             tr_loss = 0.
             tr_cls_loss = 0.
 
             model.train()
             nb_tr_examples, nb_tr_steps = 0, 0
 
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", ascii=True)):
+            for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)):
                 batch = tuple(t.to(device) for t in batch)
-
                 input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
                 if input_ids.size()[0] != args.train_batch_size:
                     continue
 
                 cls_loss = 0.
-
                 logits, _, _ = model(input_ids, segment_ids, input_mask)
 
                 if output_mode == "classification":
@@ -352,68 +308,53 @@ def main():
 
                 if n_gpu > 1:
                     loss = loss.mean()  # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
 
                 loss.backward()
-
                 tr_loss += loss.item()
                 nb_tr_examples += label_ids.size(0)
                 nb_tr_steps += 1
 
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
+                optimizer.step()
+                optimizer.zero_grad()
+                global_step += 1
 
-                if (global_step + 1) % args.eval_step == 0 or (global_step + 1) == 2 or \
-                        (global_step + 1) == num_train_optimization_steps:
-                    logger.info("***** Running evaluation *****")
-                    logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
-                    logger.info("  Num examples = %d", len(eval_examples))
-                    logger.info("  Batch size = %d", args.eval_batch_size)
+            logger.info("***** Running evaluation *****")
+            logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
 
-                    model.eval()
+            model.eval()
 
-                    loss = tr_loss / (step + 1)
-                    cls_loss = tr_cls_loss / (step + 1)
+            loss = tr_loss / nb_tr_steps
+            cls_loss = tr_cls_loss / nb_tr_steps
 
-                    result, _ = do_eval(model, task_name, eval_dataloader,
-                                        device, output_mode, eval_labels, num_labels)
-                    result['global_step'] = global_step
-                    result['cls_loss'] = cls_loss
-                    result['loss'] = loss
+            result, _ = do_eval(model, task_name, eval_dataloader,
+                                device, output_mode, eval_labels, num_labels)
+            result['epoch'] = epoch_ + 1
+            result['global_step'] = global_step
+            result['cls_loss'] = cls_loss
+            result['loss'] = loss
+            result_to_text_file(result, output_eval_file)
 
-                    result_to_text_file(result, output_eval_file)
+            save_model = False
 
-                    save_model = False
+            if task_name in acc_tasks and result['acc'] > best_dev_acc:
+                best_dev_acc = result['acc']
+                save_model = True
 
-                    if task_name in acc_tasks and result['acc'] > best_dev_acc:
-                        best_dev_acc = result['acc']
-                        save_model = True
+            if save_model:
+                logger.info("***** Save model *****")
+                model_to_save = model.module if hasattr(model, 'module') else model
 
-                    if task_name in corr_tasks and result['corr'] > best_dev_acc:
-                        best_dev_acc = result['corr']
-                        save_model = True
+                model_name = WEIGHTS_NAME
+                output_model_file = os.path.join(args.output_dir, model_name)
+                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
 
-                    if task_name in mcc_tasks and result['mcc'] > best_dev_acc:
-                        best_dev_acc = result['mcc']
-                        save_model = True
+                torch.save(model_to_save.state_dict(), output_model_file)
+                model_to_save.config.to_json_file(output_config_file)
+                tokenizer.save_vocabulary(args.output_dir)
 
-                    if save_model:
-                        logger.info("***** Save model *****")
-                        model_to_save = model.module if hasattr(model, 'module') else model
-
-                        model_name = WEIGHTS_NAME
-
-                        output_model_file = os.path.join(args.output_dir, model_name)
-                        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-                        torch.save(model_to_save.state_dict(), output_model_file)
-                        model_to_save.config.to_json_file(output_config_file)
-                        tokenizer.save_vocabulary(args.output_dir)
-
-                    model.train()
+            model.train()
 
         # Measure End Time
         training_end_time = time.monotonic()
@@ -436,13 +377,14 @@ def main():
 
         test_data, test_labels = get_tensor_data(output_mode, test_features)
         test_sampler = SequentialSampler(eval_data)
-        test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+        test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size)
 
         logger.info("\n***** Running evaluation on test dataset *****")
         logger.info("  Num examples = %d", len(test_features))
         logger.info("  Batch size = %d", args.batch_size)
 
         eval_start_time = time.monotonic()
+        model.eval()
         result, y_logits = do_eval(model, task_name, test_dataloader,
                                    device, output_mode, test_labels, num_labels)
         eval_end_time = time.monotonic()
@@ -461,5 +403,30 @@ def main():
         dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json"))
 
 
+def get_optimizer(args, model, num_train_optimization_steps):
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    size = 0
+    for n, p in model.named_parameters():
+        logger.info('n: {}'.format(n))
+        size += p.nelement()
+    logger.info('Total parameters: {}'.format(size))
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    schedule = 'warmup_linear'
+    optimizer = BertAdam(
+        optimizer_grouped_parameters,
+        schedule=schedule,
+        lr=args.learning_rate,
+        warmup=args.warmup_proportion,
+        t_total=num_train_optimization_steps
+    )
+    return optimizer
+
+
 if __name__ == "__main__":
     main()
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 82156815..cf4b1c09 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -10,7 +10,7 @@
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.nn import CrossEntropyLoss, MSELoss
 from sklearn.metrics import classification_report
-from tqdm import trange
+from tqdm import trange, tqdm
 
 from transformer import BertForSequenceClassification, WEIGHTS_NAME, CONFIG_NAME
 from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
@@ -240,7 +240,6 @@ def main():
     eval_examples = processor.get_dev_examples(data_dir)
     eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer,
                                                  output_mode)
-
     eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
     eval_sampler = SequentialSampler(eval_data)
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
@@ -277,23 +276,7 @@ def main():
     if n_gpu > 1:
         student_model = torch.nn.DataParallel(student_model)
 
-    # Prepare optimizer
-    param_optimizer = list(student_model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
-         'weight_decay': args.weight_decay},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-    ]
-    schedule = 'warmup_linear'
-    optimizer = BertAdam(
-        optimizer_grouped_parameters,
-        schedule=schedule,
-        lr=args.learning_rate,
-        warmup=0.1,
-        t_total=num_train_optimization_steps
-    )
+    optimizer = get_optimizer(args, num_train_optimization_steps, student_model)
     loss_mse = MSELoss()
     global_step = 0
     best_dev_acc = 0.0
@@ -307,7 +290,7 @@ def main():
     for epoch_ in trange(int(args.num_train_epochs)):
         nb_tr_examples, nb_tr_steps = 0, 0
 
-        for step, batch in enumerate(train_dataloader):
+        for step, batch in  enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)):
             student_model.train()
             batch = tuple(t.to(device) for t in batch)
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
@@ -358,70 +341,73 @@ def main():
             tr_loss += loss.item()
             nb_tr_examples += label_ids.size(0)
             nb_tr_steps += 1
-            if global_step % args.eval_step == 0 or global_step == num_train_optimization_steps - 1:
-                logger.info("***** Running evaluation *****")
-                logger.info("  {} step of {} steps".format(global_step, num_train_optimization_steps))
-                if previous_best is not None:
-                    logger.info(f"{fp32_performance}\nPrevious best = {previous_best}")
-
-                student_model.eval()
-
-                loss = tr_loss / (step + 1)
-                cls_loss = tr_cls_loss / (step + 1)
-                att_loss = tr_att_loss / (step + 1)
-                rep_loss = tr_rep_loss / (step + 1)
-
-                result, _ = do_eval(student_model, task_name, eval_dataloader,
-                                    device, output_mode, eval_labels, num_labels)
-
-                result['global_step'] = global_step
-                result['cls_loss'] = cls_loss
-                result['att_loss'] = att_loss
-                result['rep_loss'] = rep_loss
-                result['loss'] = loss
-
-                result_to_text_file(result, output_eval_file)
-
-                save_model = False
-
-                if task_name in acc_tasks and result['acc'] > best_dev_acc:
-                    previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
-                    best_dev_acc = result['acc']
-                    save_model = True
-
-                if save_model:
-                    logger.info(fp32_performance)
-                    logger.info(previous_best)
-                    if args.save_fp_model:
-                        logger.info("******************** Save full precision model ********************")
-                        model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
-                        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-                        output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-                        torch.save(model_to_save.state_dict(), output_model_file)
-                        model_to_save.config.to_json_file(output_config_file)
-                        tokenizer.save_vocabulary(output_dir)
-                    if args.save_quantized_model:
-                        logger.info("******************** Save quantized model ********************")
-                        output_quant_dir = os.path.join(output_dir, 'quant')
-                        if not os.path.exists(output_quant_dir):
-                            os.makedirs(output_quant_dir)
-                        model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
-                        quant_model = copy.deepcopy(model_to_save)
-                        for name, module in quant_model.named_modules():
-                            if hasattr(module, 'weight_quantizer'):
-                                module.weight.data = module.weight_quantizer.apply(
-                                    module.weight,
-                                    module.weight_clip_val,
-                                    module.weight_bits, True
-                                )
-
-                        output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
-                        output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)
-
-                        torch.save(quant_model.state_dict(), output_model_file)
-                        model_to_save.config.to_json_file(output_config_file)
-                        tokenizer.save_vocabulary(output_quant_dir)
+
+
+        logger.info("***** Running evaluation *****")
+        logger.info("  {} step of {} steps".format(global_step, num_train_optimization_steps))
+        if previous_best is not None:
+            logger.info(f"{fp32_performance}\nPrevious best = {previous_best}")
+
+        student_model.eval()
+
+        loss = tr_loss / nb_tr_steps
+        cls_loss = tr_cls_loss / nb_tr_steps
+        att_loss = tr_att_loss / nb_tr_steps
+        rep_loss = tr_rep_loss / nb_tr_steps
+
+        result, _ = do_eval(student_model, task_name, eval_dataloader,
+                            device, output_mode, eval_labels, num_labels)
+
+        result['epoch'] = epoch_ + 1
+        result['global_step'] = global_step
+        result['cls_loss'] = cls_loss
+        result['att_loss'] = att_loss
+        result['rep_loss'] = rep_loss
+        result['loss'] = loss
+
+        result_to_text_file(result, output_eval_file)
+
+        save_model = False
+
+        if task_name in acc_tasks and result['acc'] > best_dev_acc:
+            previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
+            best_dev_acc = result['acc']
+            save_model = True
+
+        if save_model:
+            logger.info(fp32_performance)
+            logger.info(previous_best)
+            if args.save_fp_model:
+                logger.info("******************** Save full precision model ********************")
+                model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
+                output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+                output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+                torch.save(model_to_save.state_dict(), output_model_file)
+                model_to_save.config.to_json_file(output_config_file)
+                tokenizer.save_vocabulary(output_dir)
+
+            if args.save_quantized_model:
+                logger.info("******************** Save quantized model ********************")
+                output_quant_dir = os.path.join(output_dir, 'quant')
+                if not os.path.exists(output_quant_dir):
+                    os.makedirs(output_quant_dir)
+                model_to_save = student_model.module if hasattr(student_model, 'module') else student_model
+                quant_model = copy.deepcopy(model_to_save)
+                for name, module in quant_model.named_modules():
+                    if hasattr(module, 'weight_quantizer'):
+                        module.weight.data = module.weight_quantizer.apply(
+                            module.weight,
+                            module.weight_clip_val,
+                            module.weight_bits, True
+                        )
+
+                output_model_file = os.path.join(output_quant_dir, WEIGHTS_NAME)
+                output_config_file = os.path.join(output_quant_dir, CONFIG_NAME)
+
+                torch.save(quant_model.state_dict(), output_model_file)
+                model_to_save.config.to_json_file(output_config_file)
+                tokenizer.save_vocabulary(output_quant_dir)
 
     # Measure End Time
     training_end_time = time.monotonic()
@@ -444,13 +430,14 @@ def main():
 
     test_data, test_labels = get_tensor_data(output_mode, test_features)
     test_sampler = SequentialSampler(eval_data)
-    test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+    test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size)
 
     logger.info("\n***** Running evaluation on test dataset *****")
     logger.info("  Num examples = %d", len(test_features))
     logger.info("  Batch size = %d", args.batch_size)
 
     eval_start_time = time.monotonic()
+    student_model.eval()
     result, y_logits = do_eval(student_model, task_name, test_dataloader,
                                device, output_mode, test_labels, num_labels)
     eval_end_time = time.monotonic()
@@ -469,5 +456,25 @@ def main():
     dictionary_to_json(report, os.path.join(output_dir, "test_results.json"))
 
 
+def get_optimizer(args, num_train_optimization_steps, student_model):
+    # Prepare optimizer
+    param_optimizer = list(student_model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    schedule = 'warmup_linear'
+    optimizer = BertAdam(
+        optimizer_grouped_parameters,
+        schedule=schedule,
+        lr=args.learning_rate,
+        warmup=0.1,
+        t_total=num_train_optimization_steps
+    )
+    return optimizer
+
+
 if __name__ == "__main__":
     main()

From 0bf88607c29f07530a585894665a49bed6ddf7e9 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 11:34:47 +0100
Subject: [PATCH 36/62] feat: correct batch size issue

---
 TernaryBERT/multiemo_fine_tune_bert.py | 6 +++---
 TernaryBERT/quant_task_multiemo.py     | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index 23439d93..c97e681d 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -70,7 +70,7 @@ def do_eval(model, task_name, eval_dataloader,
     nb_eval_steps = 0
     all_logits = None
 
-    for _, batch_ in enumerate(eval_dataloader):
+    for batch_ in tqdm(eval_dataloader):
         batch_ = tuple(t.to(device) for t in batch_)
         with torch.no_grad():
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
@@ -377,11 +377,11 @@ def main():
 
         test_data, test_labels = get_tensor_data(output_mode, test_features)
         test_sampler = SequentialSampler(eval_data)
-        test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size)
+        test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.eval_batch_size)
 
         logger.info("\n***** Running evaluation on test dataset *****")
         logger.info("  Num examples = %d", len(test_features))
-        logger.info("  Batch size = %d", args.batch_size)
+        logger.info("  Batch size = %d", args.eval_batch_size)
 
         eval_start_time = time.monotonic()
         model.eval()
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index cf4b1c09..99efa80e 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -46,7 +46,7 @@ def do_eval(model, task_name, eval_dataloader,
     nb_eval_steps = 0
     all_logits = None
 
-    for _, batch_ in enumerate(eval_dataloader):
+    for batch_ in tqdm(eval_dataloader):
         batch_ = tuple(t.to(device) for t in batch_)
         with torch.no_grad():
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
@@ -290,7 +290,7 @@ def main():
     for epoch_ in trange(int(args.num_train_epochs)):
         nb_tr_examples, nb_tr_steps = 0, 0
 
-        for step, batch in  enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)):
+        for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)):
             student_model.train()
             batch = tuple(t.to(device) for t in batch)
             input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
@@ -342,7 +342,6 @@ def main():
             nb_tr_examples += label_ids.size(0)
             nb_tr_steps += 1
 
-
         logger.info("***** Running evaluation *****")
         logger.info("  {} step of {} steps".format(global_step, num_train_optimization_steps))
         if previous_best is not None:

From e4da554d2b1f0b05c61ca0044274b8a06ad3a412 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 11:43:00 +0100
Subject: [PATCH 37/62] feat: correct saving condition

---
 TernaryBERT/multiemo_fine_tune_bert.py | 2 +-
 TernaryBERT/quant_task_multiemo.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index c97e681d..57373c9e 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -338,7 +338,7 @@ def main():
 
             save_model = False
 
-            if task_name in acc_tasks and result['acc'] > best_dev_acc:
+            if result['acc'] > best_dev_acc:
                 best_dev_acc = result['acc']
                 save_model = True
 
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 99efa80e..c91cde7a 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -368,7 +368,7 @@ def main():
 
         save_model = False
 
-        if task_name in acc_tasks and result['acc'] > best_dev_acc:
+        if result['acc'] > best_dev_acc:
             previous_best = f"f1/acc:{result['f1']}/{result['acc']}"
             best_dev_acc = result['acc']
             save_model = True

From 8bdb9e272c726bad371771891eda54c7fcf89438 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 12:39:55 +0100
Subject: [PATCH 38/62] feat: correct test data loading

---
 TernaryBERT/multiemo_fine_tune_bert.py | 2 +-
 TernaryBERT/quant_task_multiemo.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index 57373c9e..f4d7eb72 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -377,7 +377,7 @@ def main():
 
         test_data, test_labels = get_tensor_data(output_mode, test_features)
         test_sampler = SequentialSampler(eval_data)
-        test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.eval_batch_size)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
 
         logger.info("\n***** Running evaluation on test dataset *****")
         logger.info("  Num examples = %d", len(test_features))
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index c91cde7a..cd577792 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -132,7 +132,7 @@ def main():
                         help="random seed for initialization")
 
     parser.add_argument('--aug_train',
-                        action='store_false',
+                        action='store_true',
                         help="Whether to use augmented data or not")
     parser.add_argument('--pred_distill',
                         action='store_true',
@@ -429,7 +429,7 @@ def main():
 
     test_data, test_labels = get_tensor_data(output_mode, test_features)
     test_sampler = SequentialSampler(eval_data)
-    test_dataloader = DataLoader(eval_data, sampler=test_sampler, batch_size=args.batch_size)
+    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
 
     logger.info("\n***** Running evaluation on test dataset *****")
     logger.info("  Num examples = %d", len(test_features))

From 11ce3b796e8724b274c437f4f223fe3e92949160 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 14:12:26 +0100
Subject: [PATCH 39/62] fixup! feat: correct test data loading

---
 TernaryBERT/multiemo_fine_tune_bert.py |  2 +-
 TernaryBERT/quant_task_multiemo.py     | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/TernaryBERT/multiemo_fine_tune_bert.py b/TernaryBERT/multiemo_fine_tune_bert.py
index f4d7eb72..8ea25d40 100644
--- a/TernaryBERT/multiemo_fine_tune_bert.py
+++ b/TernaryBERT/multiemo_fine_tune_bert.py
@@ -376,7 +376,7 @@ def main():
                                                      output_mode)
 
         test_data, test_labels = get_tensor_data(output_mode, test_features)
-        test_sampler = SequentialSampler(eval_data)
+        test_sampler = SequentialSampler(test_data)
         test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
 
         logger.info("\n***** Running evaluation on test dataset *****")
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index cd577792..02a006f1 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -184,7 +184,7 @@ def main():
     }
 
     default_params = {
-        "multiemo": {"max_seq_length": 128, "batch_size": 16, "eval_step": 50}
+        "multiemo": {"max_seq_length": 128, "batch_size": 16}
     }
 
     acc_tasks = ["multiemo"]
@@ -205,7 +205,11 @@ def main():
         if n_gpu > 0:
             args.batch_size = int(args.batch_size * n_gpu)
         args.max_seq_length = default_params[task_name]["max_seq_length"]
-        args.eval_step = default_params[task_name]["eval_step"]
+    elif 'multiemo' in task_name:
+        args.batch_size = default_params['multiemo']["batch_size"]
+        if n_gpu > 0:
+            args.batch_size = int(args.batch_size * n_gpu)
+        args.max_seq_length = default_params['multiemo']["max_seq_length"]
 
     if 'multiemo' in task_name:
         _, lang, domain, kind = task_name.split('_')
@@ -428,7 +432,7 @@ def main():
                                                  output_mode)
 
     test_data, test_labels = get_tensor_data(output_mode, test_features)
-    test_sampler = SequentialSampler(eval_data)
+    test_sampler = SequentialSampler(test_data)
     test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
 
     logger.info("\n***** Running evaluation on test dataset *****")

From 60555878b777ac79cf76369351f0be7dc8cda5db Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Sun, 28 Nov 2021 15:52:16 +0100
Subject: [PATCH 40/62] feat: fix loading teacher model

---
 TernaryBERT/quant_task_multiemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 02a006f1..4a0c43b7 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -248,7 +248,7 @@ def main():
     eval_sampler = SequentialSampler(eval_data)
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
 
-    teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model)
+    teacher_model = BertForSequenceClassification.from_pretrained(args.teacher_model, num_labels=num_labels)
     teacher_model.to(device)
     teacher_model.eval()
     if n_gpu > 1:

From 98379e51d2121ebdbb2f440ad4ff3f7edfbae4ec Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 10:40:06 +0100
Subject: [PATCH 41/62] feat: add data processing for multiemo

---
 .../transformers/data/metrics/__init__.py     |  19 ++
 .../transformers/data/processors/multiemo.py  | 186 ++++++++++++++++++
 .../transformers/data/processors/utils.py     |   9 +
 3 files changed, 214 insertions(+)
 create mode 100644 DynaBERT/transformers/data/processors/multiemo.py

diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py
index c9ebaac3..942bd7b1 100644
--- a/DynaBERT/transformers/data/metrics/__init__.py
+++ b/DynaBERT/transformers/data/metrics/__init__.py
@@ -47,6 +47,16 @@ def acc_and_f1(preds, labels):
         }
 
 
+    def multiclass_acc_and_f1(preds, labels):  # accuracy and macro-F1 for multi-class predictions
+        acc = accuracy_score(y_true=labels, y_pred=preds)
+        f1 = f1_score(y_true=labels, y_pred=preds, average='macro')  # 'macro': unweighted mean of per-class F1
+        return {
+            "acc": acc,
+            "f1": f1,
+            "acc_and_f1": (acc + f1) / 2,  # plain mean of the two scores above
+        }
+
+
     def pearson_and_spearman(preds, labels):
         pearson_corr = pearsonr(preds, labels)[0]
         spearman_corr = spearmanr(preds, labels)[0]
@@ -81,3 +91,12 @@ def glue_compute_metrics(task_name, preds, labels):
             return {"acc": simple_accuracy(preds, labels)}
         else:
             raise KeyError(task_name)
+
+
+    def multiemo_compute_metrics(task_name, logits, labels):  # MultiEmo metrics computed from raw logits
+        preds = np.argmax(logits, axis=1)  # predicted class = index of the largest logit in each row
+        assert len(preds) == len(labels)
+        if 'multiemo' in task_name:  # substring match: any task name containing 'multiemo' qualifies
+            return multiclass_acc_and_f1(preds, labels)
+        else:
+            raise KeyError(task_name)  # unknown task: same error type glue_compute_metrics raises
diff --git a/DynaBERT/transformers/data/processors/multiemo.py b/DynaBERT/transformers/data/processors/multiemo.py
new file mode 100644
index 00000000..be4a0eb2
--- /dev/null
+++ b/DynaBERT/transformers/data/processors/multiemo.py
@@ -0,0 +1,186 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" GLUE processors and helpers """
+
+import logging
+import os
+import numpy as np
+from .utils import DataProcessor, InputExample, InputFeatures
+from ...file_utils import is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.getLogger(__name__)
+
+
+def multiemo_convert_examples_to_features(examples, tokenizer,
+                                          max_length=512,
+                                          task=None,
+                                          label_list=None,
+                                          output_mode=None,
+                                          pad_on_left=False,
+                                          pad_token=0,
+                                          pad_token_segment_id=0,
+                                          mask_padding_with_zero=True):
+    """
+    Converts examples into a list of ``InputFeatures``
+
+    Args:
+        examples: List of ``InputExamples`` to convert.
+        tokenizer: Instance of a tokenizer that will tokenize the examples
+        max_length: Maximum example length
+        task: MultiEmo task name of the form ``multiemo_<lang>_<domain>_<kind>``; fills in missing label_list/output_mode
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
+        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+        pad_token: Padding token
+        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
+        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
+            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+            actual values)
+
+    Returns:
+        A list of ``InputFeatures``, one per example and in the same order, with ``input_ids``,
+        ``attention_mask`` and ``token_type_ids`` each padded to exactly ``max_length`` entries.
+        ``tf.data.Dataset`` input is not supported by this function.
+
+    """
+
+    if task is not None:
+        task_prefix, lang, domain, kind = task.split('_')
+        processor = MultiemoProcessor(lang, domain, kind)
+        if label_list is None:
+            label_list = processor.get_labels()
+            logger.info("Using label list %s for task %s" % (label_list, task))
+        if output_mode is None:
+            output_mode = multiemo_output_modes[task_prefix.lower()]  # table is keyed by the bare task prefix
+            logger.info("Using output mode %s for task %s" % (output_mode, task))
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+
+        inputs = tokenizer.encode_plus(
+            example.text_a,
+            example.text_b,
+            add_special_tokens=True,
+            max_length=max_length,
+        )
+        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding_length = max_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask),
+                                                                                            max_length)
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids),
+                                                                                            max_length)
+        if output_mode == "classification":
+            label = label_map[example.label]
+        elif output_mode == "regression":
+            label = float(example.label)
+        else:
+            raise KeyError(output_mode)
+
+        if ex_index < 1:
+            logger.info("*** Example ***")
+            logger.info("guid: %s" % (example.guid))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
+            logger.info("label: %s (id = %d)" % (example.label, label))
+
+        features.append(
+            InputFeatures(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          token_type_ids=token_type_ids,
+                          label=label))
+
+    return features
+
+
+class MultiemoProcessor(DataProcessor):
+    """Processor for the MultiEmo data set; lang, domain and kind select the data files and the label set."""
+
+    def __init__(self, lang: str, domain: str, kind: str):
+        super(MultiemoProcessor, self).__init__()
+        self.lang = lang.lower()
+        self.domain = domain.lower()
+        self.kind = kind.lower()
+
+    def get_train_examples(self, data_dir: str) -> list:
+        """Returns the training split found in `data_dir` as a list of ``InputExample``."""
+        file_path = self.get_set_type_path(data_dir, 'train')
+        logger.info(f"LOOKING AT {file_path}")
+        return self._create_examples(self._read_txt(file_path), "train")
+
+    def get_dev_examples(self, data_dir: str) -> list:
+        """Returns the dev split found in `data_dir` as a list of ``InputExample``."""
+        file_path = self.get_set_type_path(data_dir, 'dev')
+        return self._create_examples(self._read_txt(file_path), "dev")
+
+    def get_test_examples(self, data_dir: str) -> list:
+        """Returns the test split found in `data_dir` as a list of ``InputExample``."""
+        file_path = self.get_set_type_path(data_dir, 'test')
+        return self._create_examples(self._read_txt(file_path), "test")
+
+    def get_set_type_path(self, data_dir: str, set_type: str) -> str:  # <domain>.<kind>.<set_type>.<lang>.txt
+        return os.path.join(data_dir, self.domain + '.' + self.kind + '.' + set_type + '.' + self.lang + '.txt')
+
+    def get_labels(self) -> list:
+        """Returns the four label strings: ``meta_*`` when kind is 'text', ``z_*`` for any other kind."""
+        if self.kind == 'text':
+            return ["meta_amb", "meta_minus_m", "meta_plus_m", "meta_zero"]
+        else:
+            return ["z_amb", "z_minus_m", "z_plus_m", "z_zero"]
+
+    @staticmethod
+    def _create_examples(lines: list, set_type: str) -> list:  # list of str in, list of InputExample out
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            split_line = line.split('__label__')  # each line is "<text>__label__<label>"
+            text_a = split_line[0]
+            label = split_line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+multiemo_tasks_num_labels = {
+    "multiemo": 4,  # MultiemoProcessor.get_labels() returns four labels for either kind
+}
+
+multiemo_output_modes = {
+    "multiemo": "classification"  # keyed by the bare task prefix, not the full task name
+}
diff --git a/DynaBERT/transformers/data/processors/utils.py b/DynaBERT/transformers/data/processors/utils.py
index 2d7628f9..1556ef02 100644
--- a/DynaBERT/transformers/data/processors/utils.py
+++ b/DynaBERT/transformers/data/processors/utils.py
@@ -19,6 +19,7 @@
 import copy
 import json
 
+
 class InputExample(object):
     """
     A single training/test example for simple sequence classification.
@@ -32,6 +33,7 @@ class InputExample(object):
         label: (Optional) string. The label of the example. This should be
         specified for train and dev examples, but not for test examples.
     """
+
     def __init__(self, guid, text_a, text_b=None, label=None):
         self.guid = guid
         self.text_a = text_a
@@ -123,3 +125,10 @@ def _read_tsv(cls, input_file, quotechar=None):
                     line = list(unicode(cell, 'utf-8') for cell in line)
                 lines.append(line)
             return lines
+
+    @classmethod
+    def _read_txt(cls, input_file: str) -> list:
+        """Reads a UTF-8 text file and returns its lines as a list of str, without line terminators."""
+        with open(input_file, "r", encoding='UTF-8') as f:
+            lines = f.read().splitlines()
+        return lines

From 51d250bacc73e70f6b60295a0643f16a284fee76 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 16:02:11 +0100
Subject: [PATCH 42/62] feat: add training dynabert for multiemo

---
 DynaBERT/download_bert_base.py                |  31 +
 DynaBERT/multiemo_fine_tune_bert.py           | 429 +++++++++++++
 DynaBERT/run_glue.py                          |  36 +-
 DynaBERT/run_multiemo.py                      | 577 ++++++++++++++++++
 DynaBERT/scripts/download_dataset.py          |  55 ++
 .../transformers/data/metrics/__init__.py     |   2 +
 DynaBERT/utils.py                             |  33 +
 7 files changed, 1145 insertions(+), 18 deletions(-)
 create mode 100644 DynaBERT/download_bert_base.py
 create mode 100644 DynaBERT/multiemo_fine_tune_bert.py
 create mode 100644 DynaBERT/run_multiemo.py
 create mode 100644 DynaBERT/scripts/download_dataset.py
 create mode 100644 DynaBERT/utils.py

diff --git a/DynaBERT/download_bert_base.py b/DynaBERT/download_bert_base.py
new file mode 100644
index 00000000..fa99e41a
--- /dev/null
+++ b/DynaBERT/download_bert_base.py
@@ -0,0 +1,31 @@
+import os
+import requests
+import tarfile
+
+url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz'
+
+output_path = os.path.join('data', 'models')
+os.makedirs(output_path, exist_ok=True)
+
+output_tar = os.path.join(output_path, 'bert-base-uncased.tar.gz')
+model_folder = os.path.join(output_path, 'bert-base-uncased')
+# Download the model archive; stop on an HTTP error instead of continuing without the file.
+response = requests.get(url, stream=True)
+response.raise_for_status()
+with open(output_tar, 'wb') as f:
+    f.write(response.raw.read())
+
+with tarfile.open(name=output_tar, mode="r|gz") as tar_ref:
+    tar_ref.extractall(model_folder)
+# Rename the extracted config to config.json.
+os.rename(os.path.join(model_folder, 'bert_config.json'), os.path.join(model_folder, 'config.json'))
+
+os.remove(output_tar)  # the archive is no longer needed once extracted
+
+url_vocab = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt'
+r = requests.get(url_vocab)
+r.raise_for_status()  # do not write an error page to vocab.txt
+with open(os.path.join(model_folder, 'vocab.txt'), 'wb') as f:
+    f.write(r.content)
+
+print('Completed!')
diff --git a/DynaBERT/multiemo_fine_tune_bert.py b/DynaBERT/multiemo_fine_tune_bert.py
new file mode 100644
index 00000000..e0f92f0c
--- /dev/null
+++ b/DynaBERT/multiemo_fine_tune_bert.py
@@ -0,0 +1,429 @@
+# coding=utf-8
+# 2019.12.2-Changed for TinyBERT task-specific distillation
+#      Huawei Technologies Co., Ltd. <yinyichun@huawei.com>
+# Copyright 2020 Huawei Technologies Co., Ltd.
+# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+import time
+from datetime import timedelta
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
+from tqdm import tqdm, trange
+
+from sklearn.metrics import classification_report
+
+from utils import result_to_text_file, dictionary_to_json
+from transformers.modeling_bert import BertForSequenceClassification
+from transformers.tokenization_bert import BertTokenizer
+from transformers.data.metrics import multiemo_compute_metrics as compute_metrics
+from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \
+    MultiemoProcessor
+from transformers import AdamW, WarmupLinearSchedule
+
+from transformers.file_utils import WEIGHTS_NAME, CONFIG_NAME
+
+csv.field_size_limit(sys.maxsize)
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
+fh = logging.FileHandler('debug_layer_loss.log')
+fh.setFormatter(logging.Formatter(log_format))
+logging.getLogger().addHandler(fh)
+logger = logging.getLogger()
+
+
+def get_tensor_data(output_mode, features):
+    """Packs features into a TensorDataset (input ids, attention mask, token type ids, labels); also returns labels."""
+    # An unknown output mode fails here with a KeyError instead of leaving the labels unbound.
+    label_dtype = {"classification": torch.long, "regression": torch.float}[output_mode]
+    all_label_ids = torch.tensor([f.label for f in features], dtype=label_dtype)
+
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return tensor_data, all_label_ids
+
+
+def do_eval(model, task_name, eval_dataloader,
+            device, output_mode, eval_labels, num_labels):
+    """Runs `model` over `eval_dataloader`; returns (metrics dict with 'eval_loss' added, all logits as one array)."""
+    eval_loss = 0
+    nb_eval_steps = 0
+    batch_logits = []  # collected per batch and concatenated once, instead of re-copying with np.append
+
+    model.eval()  # loop-invariant, so set once up front
+    for batch in tqdm(eval_dataloader):
+        batch = tuple(t.to(device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
+            outputs = model(**inputs)
+            tmp_eval_loss, logits = outputs[:2]
+
+        eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+        batch_logits.append(logits.detach().cpu().numpy())
+
+    # Mean of the per-batch losses (an empty dataloader raises ZeroDivisionError here).
+    eval_loss = eval_loss / nb_eval_steps
+    all_logits = np.concatenate(batch_logits, axis=0)
+
+    if output_mode == "regression":
+        all_logits = np.squeeze(all_logits)
+    # NOTE: `num_labels` is accepted for call-site compatibility but is not used.
+    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
+    result['eval_loss'] = eval_loss
+    return result, all_logits
+
+
+def main():
+    """Fine-tunes BERT on a MultiEmo task (or only evaluates on dev with --do_eval), then scores the test split."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--pretrained_model",
+                        default=None,
+                        type=str,
+                        help="The pretrained model dir.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=16,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=16,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--weight_decay', '--wd',
+                        default=0.01,
+                        type=float,
+                        metavar='W',
+                        help='weight decay')
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+
+    # added arguments
+    parser.add_argument('--aug_train',
+                        action='store_true')
+    parser.add_argument('--eval_step',
+                        type=int,
+                        default=50)
+
+    args = parser.parse_args()
+    logger.info('The args: {}'.format(args))
+
+    # intermediate distillation default parameters
+    default_params = {
+        "multiemo": {"num_train_epochs": 3, "max_seq_length": 128},
+    }
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    logger.info("device: {} n_gpu: {}".format(device, n_gpu))
+
+    # Prepare seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    # Prepare task settings
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    task_name = args.task_name.lower()
+    if task_name in default_params:  # NOTE(review): keys are bare prefixes; four-part task names never match
+        args.max_seq_length = default_params[task_name]["max_seq_length"]
+
+    if not args.do_eval:
+        if task_name in default_params:
+            args.num_train_epochs = default_params[task_name]["num_train_epochs"]
+
+    if 'multiemo' in task_name:
+        _, lang, domain, kind = task_name.split('_')
+        processor = MultiemoProcessor(lang, domain, kind)
+    else:
+        raise ValueError("Task not found: %s" % task_name)
+
+    if 'multiemo' in task_name:
+        output_mode = 'classification'
+    else:
+        raise ValueError("Task not found: %s" % task_name)
+
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
+
+    if not args.do_eval:
+        if not args.aug_train:
+            train_examples = processor.get_train_examples(args.data_dir)
+        else:
+            train_examples = processor.get_aug_examples(args.data_dir)  # NOTE(review): not on MultiemoProcessor
+
+        t_total = (len(train_examples) + args.train_batch_size - 1) // args.train_batch_size * int(args.num_train_epochs)
+
+        train_features = convert_examples_to_features(
+            train_examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=False,
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=0,
+        )
+        train_data, _ = get_tensor_data(output_mode, train_features)
+        train_sampler = RandomSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_examples = processor.get_dev_examples(args.data_dir)
+    eval_features = convert_examples_to_features(
+        eval_examples, tokenizer, label_list=label_list, max_length=args.max_seq_length,
+        output_mode=output_mode, pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+    )
+    eval_data, eval_labels = get_tensor_data(output_mode, eval_features)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    model = BertForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=num_labels)
+    model.to(device)
+    if args.do_eval:
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+
+        model.eval()
+        result, _ = do_eval(model, task_name, eval_dataloader,
+                            device, output_mode, eval_labels, num_labels)
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+    else:
+        training_start_time = time.monotonic()
+
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", t_total)
+        if n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
+        optimizer, scheduler = get_optimizer_and_scheduler(args, model, t_total)
+
+        # Train and evaluate
+        global_step = 0
+        best_dev_acc = 0.0
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+
+        for epoch_ in range(int(args.num_train_epochs)):
+            tr_loss = 0.
+            tr_cls_loss = 0.
+
+            model.train()
+            nb_tr_steps = 0
+
+            for step, batch in enumerate(tqdm(train_dataloader, f"Epoch {epoch_ + 1}: ", ascii=True)):
+                batch = tuple(t.to(device) for t in batch)
+                # batch layout (see get_tensor_data): input ids, attention mask, token type ids, labels
+                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3],
+                          'token_type_ids': batch[2]}
+
+                cls_loss = model(**inputs)[0]
+
+                loss = cls_loss
+                tr_cls_loss += cls_loss.mean().item()  # mean() so a multi-GPU loss vector reduces to a scalar
+
+                if n_gpu > 1:
+                    loss = loss.mean()  # mean() to average on multi-gpu.
+
+                loss.backward()
+                tr_loss += loss.item()
+                nb_tr_steps += 1
+
+                optimizer.step()
+                scheduler.step()
+                model.zero_grad()
+                global_step += 1
+
+            logger.info("***** Running evaluation *****")
+            logger.info("  Epoch = {} iter {} step".format(epoch_, global_step))
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+
+            model.eval()
+
+            loss = tr_loss / nb_tr_steps
+            cls_loss = tr_cls_loss / nb_tr_steps
+
+            result, _ = do_eval(model, task_name, eval_dataloader,
+                                device, output_mode, eval_labels, num_labels)
+            result['epoch'] = epoch_ + 1
+            result['global_step'] = global_step
+            result['cls_loss'] = cls_loss
+            result['loss'] = loss
+            result_to_text_file(result, output_eval_file)
+
+            save_model = False
+
+            if result['acc'] > best_dev_acc:
+                best_dev_acc = result['acc']
+                save_model = True
+
+            if save_model:
+                logger.info("***** Save model *****")
+                model_to_save = model.module if hasattr(model, 'module') else model
+
+                model_name = WEIGHTS_NAME
+                output_model_file = os.path.join(args.output_dir, model_name)
+                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+                torch.save(model_to_save.state_dict(), output_model_file)
+                model_to_save.config.to_json_file(output_config_file)
+                tokenizer.save_vocabulary(args.output_dir)
+
+            model.train()
+
+        # Measure End Time
+        training_end_time = time.monotonic()
+
+        diff = timedelta(seconds=training_end_time - training_start_time)
+        diff_seconds = diff.total_seconds()
+
+        training_parameters = vars(args)
+        training_parameters['training_time'] = diff_seconds
+
+        output_training_params_file = os.path.join(args.output_dir, "training_params.json")
+        dictionary_to_json(training_parameters, output_training_params_file)
+
+        #########################
+        #       Test model      #
+        #########################
+        test_examples = processor.get_test_examples(args.data_dir)
+        test_features = convert_examples_to_features(
+            test_examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=False,
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=0
+        )
+
+        test_data, test_labels = get_tensor_data(output_mode, test_features)
+        test_sampler = SequentialSampler(test_data)
+        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
+
+        logger.info("\n***** Running evaluation on test dataset *****")
+        logger.info("  Num examples = %d", len(test_features))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+
+        eval_start_time = time.monotonic()
+        model.eval()
+        result, y_logits = do_eval(model, task_name, test_dataloader,
+                                   device, output_mode, test_labels, num_labels)
+        eval_end_time = time.monotonic()
+
+        diff = timedelta(seconds=eval_end_time - eval_start_time)
+        diff_seconds = diff.total_seconds()
+        result['eval_time'] = diff_seconds
+        result_to_text_file(result, os.path.join(args.output_dir, "test_results.txt"))
+
+        y_pred = np.argmax(y_logits, axis=1)
+        print('\n\t**** Classification report ****\n')
+        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
+
+        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
+        report['eval_time'] = diff_seconds
+        dictionary_to_json(report, os.path.join(args.output_dir, "test_results.json"))
+
+
+def get_optimizer_and_scheduler(args, model, t_total):
+    """Builds AdamW (no weight decay on biases and LayerNorm weights) and a warmup schedule over `t_total` steps."""
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+         'weight_decay': 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(args.warmup_proportion * t_total), t_total=t_total)
+    return optimizer, scheduler
+
+
+if __name__ == "__main__":
+    main()
diff --git a/DynaBERT/run_glue.py b/DynaBERT/run_glue.py
index 68ce9718..e29362ee 100644
--- a/DynaBERT/run_glue.py
+++ b/DynaBERT/run_glue.py
@@ -29,12 +29,12 @@
 import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
 from tqdm import tqdm, trange
-from torch.nn import  MSELoss
+from torch.nn import MSELoss
 
 from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,
-                                  RobertaConfig,
-                                  RobertaForSequenceClassification,
-                                  RobertaTokenizer)
+                          RobertaConfig,
+                          RobertaForSequenceClassification,
+                          RobertaTokenizer)
 
 from transformers import AdamW, WarmupLinearSchedule
 
@@ -43,11 +43,11 @@
 from transformers import glue_processors as processors
 from transformers import glue_convert_examples_to_features as convert_examples_to_features
 
-
 logger = logging.getLogger(__name__)
 CONFIG_NAME = "config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
 
+
 def soft_cross_entropy(predicts, targets):
     student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1)
     targets_prob = torch.nn.functional.softmax(targets, dim=-1)
@@ -82,7 +82,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
 
     # Prepare optimizer and schedule (linear warmup and decay)
     if args.model_type == 'roberta':
-        args.warmup_steps = int(t_total*0.06)
+        args.warmup_steps = int(t_total * 0.06)
 
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
@@ -90,7 +90,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
          'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
          'weight_decay': 0.0}
-        ]
+    ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
 
@@ -210,7 +210,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
                 if global_step > 0 and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     if args.evaluate_during_training:
                         acc = []
-                        if args.task_name == "mnli":   # for both MNLI-m and MNLI-mm
+                        if args.task_name == "mnli":  # for both MNLI-m and MNLI-mm
                             acc_both = []
 
                         # collect performance of all sub-networks
@@ -240,7 +240,7 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
                             else:
                                 print("***best***{}\n".format(acc))
                                 with open(output_eval_file, "a") as writer:
-                                    writer.write("{}\n" .format(acc))
+                                    writer.write("{}\n".format(acc))
 
                             logger.info("Saving model checkpoint to %s", args.output_dir)
                             model_to_save = model.module if hasattr(model, 'module') else model
@@ -307,11 +307,11 @@ def evaluate(args, model, tokenizer, prefix=""):
             preds = np.squeeze(preds)
         result = compute_metrics(eval_task, preds, out_label_ids)
         if eval_task == 'mnli-mm':
-            results.update({'acc_mm':result['acc']})
+            results.update({'acc_mm': result['acc']})
         else:
             results.update(result)
 
-        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") # wirte all the results to the same file
+        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")  # wirte all the results to the same file
         with open(output_eval_file, "a") as writer:
             logger.info("***** Eval results {} *****".format(prefix))
             for key in sorted(result.keys()):
@@ -322,7 +322,6 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-
     processor = processors[task]()
     output_mode = output_modes[task]
     logger.info("Creating features from dataset file at %s", args.data_dir)
@@ -338,10 +337,10 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
                                             label_list=label_list,
                                             max_length=args.max_seq_length,
                                             output_mode=output_mode,
-                                            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+                                            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
                                             pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
-    )
+                                            )
 
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
@@ -405,7 +404,7 @@ def compute_neuron_head_importance(args, model, tokenizer):
         for batch in tqdm(eval_dataloader, desc="Evaluating"):
             batch = tuple(t.to(args.device) for t in batch)
             input_ids, input_mask, _, label_ids = batch
-            segment_ids = batch[2] if args.model_type=='bert' else None  # RoBERTa does't use segment_ids
+            segment_ids = batch[2] if args.model_type == 'bert' else None  # RoBERTa does't use segment_ids
 
             # calculate head importance
             outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids,
@@ -415,7 +414,8 @@ def compute_neuron_head_importance(args, model, tokenizer):
             head_importance += head_mask.grad.abs().detach()
 
             # calculate  neuron importance
-            for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight, neuron_importance):
+            for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight,
+                                                      neuron_importance):
                 current_importance += ((w1 * w1.grad).sum(dim=1) + b1 * b1.grad).abs().detach()
                 current_importance += ((w2 * w2.grad).sum(dim=0)).abs().detach()
 
@@ -515,7 +515,7 @@ def main():
     args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')]
 
     # Setup CUDA, GPU & distributed training
-    device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     args.n_gpu = torch.cuda.device_count()
     args.device = device
 
@@ -535,7 +535,7 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name)
-    config.output_attentions, config.output_hidden_states, config.output_intermediate = True,True,True
+    config.output_attentions, config.output_hidden_states, config.output_intermediate = True, True, True
     tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case)
 
     # load teacher model if necessary
diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py
new file mode 100644
index 00000000..3eb3bd05
--- /dev/null
+++ b/DynaBERT/run_multiemo.py
@@ -0,0 +1,577 @@
+# coding=utf-8
+# 2020.08.28 - Changed regular fine-tuning to fine-tuning with adaptive width and depth
+#              Huawei Technologies Co., Ltd <houlu3@huawei.com>
+# Copyright (c) 2020, Huawei Technologies Co., Ltd.  All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors,  the HuggingFace Inc.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import math
+import time
+from datetime import timedelta
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
+from tqdm import tqdm, trange
+from torch.nn import MSELoss
+
+from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,
+                          RobertaConfig,
+                          RobertaForSequenceClassification,
+                          RobertaTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+from transformers.data.metrics import multiemo_compute_metrics as compute_metrics
+from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \
+    MultiemoProcessor, multiemo_output_modes
+from utils import result_to_text_file, dictionary_to_json
+
+logger = logging.getLogger(__name__)
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
+
+def soft_cross_entropy(predicts, targets):
+    """Cross-entropy of softmax(predicts) against softmax(targets) over the last dim, averaged over the rest."""
+    log_q, p = torch.nn.functional.log_softmax(predicts, dim=-1), torch.nn.functional.softmax(targets, dim=-1)
+    return -(p * log_q).sum(dim=-1).mean()
+
+
+loss_mse = MSELoss()  # shared criterion for the hidden-state distillation terms in train()
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig)), ())
+MODEL_CLASSES = {  # --model_type -> (config class, model class, tokenizer class); unpacked in main()
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
+}
+
+
+def set_seed(args):
+    """Seed the Python, NumPy and torch RNGs from args.seed; also every CUDA device when args.n_gpu > 0."""
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer, teacher_model=None):
+    """Train all (depth_mult, width_mult) sub-networks of `model`, distilling from `teacher_model` if given."""
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
+                                  batch_size=args.train_batch_size)
+
+    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    if args.model_type == 'roberta':
+        args.warmup_steps = int(t_total * 0.06)
+
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+         'weight_decay': 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    global_step = 0
+    tr_loss = 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
+    set_seed(args)
+
+    current_best = 0
+    output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
+
+    for epoch in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3],
+                      'token_type_ids': batch[2] if args.model_type in ['bert'] else None}
+
+            # prepare the hidden states and logits of the teacher model
+            if args.training_phase == 'dynabertw' and teacher_model:
+                with torch.no_grad():
+                    _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
+            elif args.training_phase == 'dynabert' and teacher_model:
+                hidden_max_all, logits_max_all = [], []
+                for width_mult in sorted(args.width_mult_list, reverse=True):
+                    with torch.no_grad():
+                        _, teacher_logit, teacher_reps, _, _ = teacher_model(**inputs)
+                        hidden_max_all.append(teacher_reps)
+                        logits_max_all.append(teacher_logit)
+
+            # accumulate grads for all sub-networks
+            for depth_mult in sorted(args.depth_mult_list, reverse=True):
+                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
+                # unwrap DataParallel in every phase, exactly as the former always-true condition did
+                model = model.module if hasattr(model, 'module') else model
+                if args.training_phase in ('dynabert', 'final_finetuning'):  # teacher layers to match
+                    base_model = getattr(model, model.base_model_prefix, model)
+                    n_layers = base_model.config.num_hidden_layers
+                    depth = round(depth_mult * n_layers)
+                    kept_layers_index = []
+                    for i in range(depth):
+                        kept_layers_index.append(math.floor(i / depth_mult))
+                    kept_layers_index.append(n_layers)
+
+                # adjust width
+                width_idx = 0
+                for width_mult in sorted(args.width_mult_list, reverse=True):
+                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
+                    # stage 1: width-adaptive
+                    if args.training_phase == 'dynabertw':
+                        loss, student_logit, student_reps, _, _ = model(**inputs)
+
+                        # distillation loss of logits
+                        if args.output_mode == "classification":
+                            logit_loss = soft_cross_entropy(student_logit, teacher_logit.detach())
+                        elif args.output_mode == "regression":
+                            logit_loss = 0
+
+                        # distillation loss of hidden states
+                        rep_loss = 0
+                        for student_rep, teacher_rep in zip(student_reps, teacher_reps):
+                            tmp_loss = loss_mse(student_rep, teacher_rep.detach())
+                            rep_loss += tmp_loss
+
+                        loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
+
+                    # stage 2: width- and depth- adaptive
+                    elif args.training_phase == 'dynabert':
+                        loss, student_logit, student_reps, _, _ = model(**inputs)
+
+                        # distillation loss of logits
+                        if args.output_mode == "classification":
+                            logit_loss = soft_cross_entropy(student_logit, logits_max_all[width_idx].detach())
+                        elif args.output_mode == "regression":
+                            logit_loss = 0
+
+                        # distillation loss of hidden states
+                        rep_loss = 0
+                        for student_rep, teacher_rep in zip(
+                                student_reps, list(hidden_max_all[width_idx][i] for i in kept_layers_index)):
+                            tmp_loss = loss_mse(student_rep, teacher_rep.detach())
+                            rep_loss += tmp_loss
+
+                        loss = args.depth_lambda1 * logit_loss + args.depth_lambda2 * rep_loss  # distillation terms only
+                        width_idx += 1  # move to the next width
+
+                    # stage 3: final finetuning
+                    else:
+                        loss = model(**inputs)[0]
+
+                    logger.debug("sub-network loss: %s", loss)
+                    if args.n_gpu > 1:
+                        loss = loss.mean()
+                    if args.gradient_accumulation_steps > 1:
+                        loss = loss / args.gradient_accumulation_steps
+
+                    loss.backward()
+
+            # clip the accumulated grad from all widths
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+            if 0 < t_total < global_step:
+                epoch_iterator.close()
+                break
+
+        # evaluate
+        if args.evaluate_during_training:
+            acc = []
+
+            # collect performance of all sub-networks
+            for depth_mult in sorted(args.depth_mult_list, reverse=True):
+                model.apply(lambda m: setattr(m, 'depth_mult', depth_mult))
+                for width_mult in sorted(args.width_mult_list, reverse=True):
+                    model.apply(lambda m: setattr(m, 'width_mult', width_mult))
+                    results = evaluate(args, model, tokenizer)
+
+                    logger.info("********** start evaluate results *********")
+                    logger.info("depth_mult: %s ", depth_mult)
+                    logger.info("width_mult: %s ", width_mult)
+                    logger.info("results: %s ", results)
+                    logger.info("********** end evaluate results *********")
+
+                    acc.append(list(results.values())[0])
+
+            result_to_save = dict()
+            result_to_save['epoch'] = epoch + 1
+            result_to_save['global_step'] = global_step
+            result_to_save['loss'] = loss  # NOTE(review): still a tensor; confirm result_to_text_file copes
+            result_to_save['acc'] = acc
+
+            result_to_text_file(result_to_save, output_eval_file)
+
+            # save model
+            if sum(acc) > current_best:
+                current_best = sum(acc)
+
+                print("***best***{}\n".format(acc))
+                with open(output_eval_file, "a") as writer:
+                    writer.write("{}\n".format(acc))
+
+                logger.info("Saving model checkpoint to %s", args.output_dir)
+                model_to_save = model.module if hasattr(model, 'module') else model
+                model_to_save.save_pretrained(args.output_dir)
+                torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+                model_to_save.config.to_json_file(os.path.join(args.output_dir, CONFIG_NAME))
+                tokenizer.save_vocabulary(args.output_dir)
+
+        if 0 < t_total < global_step:
+            train_iterator.close()
+            break
+
+    return global_step, tr_loss / max(global_step, 1)  # clamp: no optimizer step may have run
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """Run `model` over the dev split of args.task_name, append the metrics to eval_results.txt and return them."""
+    results = {}
+
+    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
+                                 batch_size=args.eval_batch_size)
+
+    eval_loss = 0.0  # summed below but not read again within this function
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
+            if args.model_type != 'distilbert':  # presumably always true: main() only accepts MODEL_CLASSES keys
+                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None
+            outputs = model(**inputs)
+            tmp_eval_loss, logits = outputs[:2]
+            eval_loss += tmp_eval_loss.mean().item()
+
+        nb_eval_steps += 1
+        if preds is None:  # first batch: start the accumulated arrays
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = inputs['labels'].detach().cpu().numpy()
+        else:  # later batches: concatenate along the example axis
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+
+    if args.output_mode == "regression":
+        preds = np.squeeze(preds)
+
+    result = compute_metrics(args.task_name, preds, out_label_ids)
+    results.update(result)
+
+    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+    with open(output_eval_file, "a") as writer:  # append mode: every call adds to the same file
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+        writer.write("\n")
+
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    """Build a TensorDataset (input_ids, attention_mask, token_type_ids, labels) for a MultiEmo task.
+
+    `task` is split on '_' into four parts (the first is ignored); the dev split is read when `evaluate`
+    is true, else the train split plus the augmented examples when `args.data_aug` is set.
+    """
+    _, lang, domain, kind = task.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    output_mode = multiemo_output_modes['multiemo']
+    logger.info("Creating features from dataset file at %s", args.data_dir)
+    label_list = processor.get_labels()
+    examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+    if not evaluate and args.data_aug:
+        examples = examples + processor.get_train_examples_aug(args.data_dir)
+    features = convert_examples_to_features(
+        examples,
+        tokenizer,
+        label_list=label_list,
+        max_length=args.max_seq_length,
+        output_mode=output_mode,
+        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+    )
+
+    # Convert to Tensors and build dataset
+    if output_mode not in ("classification", "regression"):
+        raise ValueError("Unsupported output mode: %s" % output_mode)
+    label_dtype = torch.long if output_mode == "classification" else torch.float
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)
+    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
+
+
+def compute_neuron_head_importance(args, model, tokenizer):
+    """Accumulate gradient-based importance scores of attention heads and FFN neurons over the dev split.
+    Returns (head_importance, neuron_importance): an (n_layers, n_heads) tensor and a list with one 1-D tensor
+    per collected 'intermediate' weight matrix. The approach follows http://arxiv.org/abs/1905.10650."""
+    # prepare things for heads
+    model = model.module if hasattr(model, 'module') else model
+    base_model = getattr(model, model.base_model_prefix, model)
+    n_layers, n_heads = base_model.config.num_hidden_layers, base_model.config.num_attention_heads
+    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
+    head_mask = torch.ones(n_layers, n_heads).to(args.device)
+    head_mask.requires_grad_(requires_grad=True)  # so backward() fills head_mask.grad
+
+    # collect weights (parameters are picked by substring match on their names)
+    intermediate_weight = []
+    intermediate_bias = []
+    output_weight = []
+    for name, w in model.named_parameters():
+        if 'intermediate' in name:
+            if w.dim() > 1:
+                intermediate_weight.append(w)
+            else:
+                intermediate_bias.append(w)
+
+        if 'output' in name and 'attention' not in name:  # 'output' parameters outside attention
+            if w.dim() > 1:  # only the weight matrices are kept, biases are skipped
+                output_weight.append(w)
+
+    neuron_importance = []
+    for w in intermediate_weight:
+        neuron_importance.append(torch.zeros(w.shape[0]).to(args.device))
+
+    model.to(args.device)
+
+    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+        input_ids, input_mask, _, label_ids = batch
+        segment_ids = batch[2] if args.model_type == 'bert' else None  # RoBERTa doesn't use segment_ids
+
+        # calculate head importance: |d loss / d head_mask|
+        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids,
+                        head_mask=head_mask)
+        loss = outputs[0]
+        loss.backward()  # NOTE(review): grads are never zeroed in this loop, so .grad is a running sum - confirm
+        head_importance += head_mask.grad.abs().detach()
+
+        # calculate neuron importance: |sum(w1 * grad, dim=1) + b1 * grad| + |sum(w2 * grad, dim=0)|
+        for w1, b1, w2, current_importance in zip(intermediate_weight, intermediate_bias, output_weight,
+                                                  neuron_importance):
+            current_importance += ((w1 * w1.grad).sum(dim=1) + b1 * b1.grad).abs().detach()
+            current_importance += ((w2 * w2.grad).sum(dim=0)).abs().detach()
+
+    return head_importance, neuron_importance
+
+
+def reorder_neuron_head(model, head_importance, neuron_importance):
+    """Permute heads and FFN neurons of every encoder layer into descending order of importance.
+
+        Arguments:
+            model: model (optionally wrapped, exposing `.module`) whose encoder layers are reordered
+            head_importance: per-layer head scores, indexed as head_importance[layer]
+            neuron_importance: list with one neuron score tensor per layer
+    """
+    unwrapped = getattr(model, 'module', model)
+    backbone = getattr(unwrapped, unwrapped.base_model_prefix, unwrapped)
+
+    for layer_no, neuron_scores in enumerate(neuron_importance):
+        # heads first: the second item returned by torch.sort is the sorting permutation
+        _, head_order = torch.sort(head_importance[layer_no], descending=True)
+        block = backbone.encoder.layer[layer_no]
+        block.attention.reorder_heads(head_order)
+        # then the FFN: the same permutation goes to the intermediate and the output module
+        _, neuron_order = torch.sort(neuron_scores, descending=True)
+        block.intermediate.reorder_neurons(neuron_order)
+        block.output.reorder_neurons(neuron_order)
+
+
+def main():
+    """Parse the CLI, set up tokenizer/teacher/student for a MultiEmo task and, with --do_train, train."""
+    # argparse `type` for on/off options: without it any CLI value, even "False", is a truthy string
+    def str_to_bool(value):
+        return str(value).lower() in ("1", "true", "yes", "y")
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_dir", default=None, type=str, required=True,
+                        help="The student (and teacher) model dir.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where trained model is saved.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the multiemo task to train")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--evaluate_during_training", default=True, type=str_to_bool,
+                        help="Whether to run evaluation at the end of every training epoch (true/false).")
+    parser.add_argument("--do_lower_case", default=True, type=str_to_bool,
+                        help="Whether the tokenizer should lower-case its input, i.e. uncased model (true/false).")
+    parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=2e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    parser.add_argument("--hidden_dropout_prob", default=0.1, type=float,
+                        help="dropout rate on hidden states.")
+    parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float,
+                        help="dropout rate on attention probs.")
+
+    parser.add_argument('--data_aug', action='store_true', help="whether using data augmentation")
+    # for depth direction
+    parser.add_argument('--depth_mult_list', type=str, default='1.',
+                        help="the possible depths used for training, e.g., '1.' is for default")
+    parser.add_argument("--depth_lambda1", default=1.0, type=float,
+                        help="logit matching coef.")
+    parser.add_argument("--depth_lambda2", default=1.0, type=float,
+                        help="hidden states matching coef.")
+    # for width direction
+    parser.add_argument('--width_mult_list', type=str, default='1.',
+                        help="the possible widths used for training, e.g., '1.' is for separate training "
+                             "while '0.25,0.5,0.75,1.0' is for vanilla slimmable training")
+    parser.add_argument("--width_lambda1", default=1.0, type=float,
+                        help="logit matching coef.")
+    parser.add_argument("--width_lambda2", default=0.1, type=float,
+                        help="hidden states matching coef.")
+
+    parser.add_argument("--training_phase", default="dynabertw", type=str,
+                        help="can be finetuning, dynabertw, dynabert, final_finetuning")
+
+    args = parser.parse_args()
+
+    args.width_mult_list = [float(width) for width in args.width_mult_list.split(',')]
+    args.depth_mult_list = [float(depth) for depth in args.depth_mult_list.split(',')]
+
+    # Setup CUDA, GPU & distributed training
+    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare MULTIEMO task: provide num_labels here
+    args.task_name = args.task_name.lower()
+    if 'multiemo' not in args.task_name:
+        raise ValueError("Task not found: %s" % args.task_name)
+    _, lang, domain, kind = args.task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    args.output_mode = multiemo_output_modes['multiemo']
+    num_labels = len(processor.get_labels())
+
+    # prepare model, tokenizer and config
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name)
+    config.output_attentions, config.output_hidden_states, config.output_intermediate = True, True, True
+    tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case)
+
+    # load teacher model if necessary
+    if args.training_phase == 'dynabertw' or args.training_phase == 'dynabert':
+        teacher_model = model_class.from_pretrained(args.model_dir, config=config)
+        teacher_model.to(args.device)
+    else:
+        teacher_model = None
+
+    # load student model if necessary
+    model = model_class.from_pretrained(args.model_dir, config=config)
+
+    if args.training_phase == 'dynabertw':
+        # rewire the network according to the importance of attention heads and neurons
+        head_importance, neuron_importance = compute_neuron_head_importance(args, model, tokenizer)
+        reorder_neuron_head(model, head_importance, neuron_importance)
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        training_start_time = time.monotonic()
+
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        # teacher_model is None outside the distillation phases, which is also train()'s default
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher_model)
+
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+        # Measure End Time
+        training_end_time = time.monotonic()
+
+        diff_seconds = timedelta(seconds=training_end_time - training_start_time).total_seconds()
+
+        training_parameters = vars(args)  # NOTE(review): aliases args.__dict__ (incl. args.device) - confirm
+        training_parameters['training_time'] = diff_seconds
+
+        output_training_params_file = os.path.join(args.output_dir, "training_params.json")
+        dictionary_to_json(training_parameters, output_training_params_file)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/DynaBERT/scripts/download_dataset.py b/DynaBERT/scripts/download_dataset.py
new file mode 100644
index 00000000..701ffd3a
--- /dev/null
+++ b/DynaBERT/scripts/download_dataset.py
@@ -0,0 +1,55 @@
+import argparse
+import os
+import zipfile
+
+import requests
+from tqdm.auto import tqdm
+
+# url = 'https://clarin-pl.eu/dspace/bitstream/handle/11321/798/multiemo.zip?sequence=2&isAllowed=y'
+url = 'https://clarin-pl.eu/dspace/handle/11321/798/allzip'
+
+
+def main(data_dir):
+    """Download the MultiEmo archive into ``data_dir`` and unpack it in place.
+
+    Raises ``requests.HTTPError`` when the server answers with a 4xx/5xx status.
+    """
+    output_zip = os.path.join(
+        data_dir,
+        'MultiEmo_ Multilingual, Multilevel, Multidomain Sentiment Analysis Corpus of Consumer Reviews.zip')
+
+    response = requests.get(url, stream=True)
+    # Fail fast: a failed request used to fall through to ZipFile() on a file never written.
+    response.raise_for_status()
+
+    total_size_in_bytes = int(response.headers.get('content-length', 0))
+    with open(output_zip, 'wb') as f, \
+            tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                progress_bar.update(len(chunk))
+                f.write(chunk)
+    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+        print("ERROR, something went wrong")
+
+    with zipfile.ZipFile(output_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+    os.remove(output_zip)
+    os.remove(os.path.join(data_dir, 'multiemo.7z'))
+
+    data_output_zip = os.path.join(data_dir, 'multiemo.zip')
+    with zipfile.ZipFile(data_output_zip, "r") as zip_ref:
+        zip_ref.extractall(data_dir)
+    os.remove(data_output_zip)
+    os.remove(os.path.join(data_dir, 'README.txt'))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='multiemo2')
+    args = parser.parse_args()
+
+    # makedirs (not mkdir): nested targets such as data/multiemo2 need their parents created too.
+    os.makedirs(args.data_dir, exist_ok=True)
+
+    main(data_dir=args.data_dir)
diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py
index 942bd7b1..00d321ad 100644
--- a/DynaBERT/transformers/data/metrics/__init__.py
+++ b/DynaBERT/transformers/data/metrics/__init__.py
@@ -18,6 +18,8 @@
 import sys
 import logging
 
+import numpy as np
+
 logger = logging.getLogger(__name__)
 
 try:
diff --git a/DynaBERT/utils.py b/DynaBERT/utils.py
new file mode 100644
index 00000000..5decae1e
--- /dev/null
+++ b/DynaBERT/utils.py
@@ -0,0 +1,33 @@
+import json
+import logging
+import os
+import sys
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%d/%m/%Y %H:%M:%S')
+logger = logging.getLogger(__name__)
+
+
+def result_to_text_file(result: dict, file_name: str, verbose: bool = True) -> None:
+    """Append ``key = value`` lines (sorted by key) to ``file_name``; ``verbose`` is unused."""
+    with open(file_name, "a") as out:
+        out.writelines("%s = %s\n" % (name, str(result[name])) for name in sorted(result))
+        out.write("")
+
+
+def dictionary_to_json(dictionary: dict, file_name: str) -> None:
+    with open(file_name, "w") as f:  # "w" truncates: any previous content of the file is replaced
+        json.dump(dictionary, f, indent=2)  # json raises TypeError for non-serializable values
+
+
+def is_folder_empty(folder_name: str):
+    """Return True when ``folder_name`` holds no entries other than dot-prefixed ones."""
+    # Entries whose name starts with '.' are ignored when deciding emptiness.
+    visible = (entry for entry in os.listdir(folder_name) if not entry.startswith('.'))
+    return not any(True for _ in visible)
+
+
+def get_immediate_subdirectories(directory: str):
+    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
+    return [path for path in candidates if os.path.isdir(path)]  # direct children only
\ No newline at end of file

From a0fb1f2f4bb0200a2811c36b18049efe8e14da29 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 16:20:36 +0100
Subject: [PATCH 43/62] feat: add evaluation on test dataset

---
 DynaBERT/eval_multiemo.py                     | 372 ++++++++++++++++++
 .../transformers/data/processors/multiemo.py  |  25 +-
 2 files changed, 386 insertions(+), 11 deletions(-)
 create mode 100644 DynaBERT/eval_multiemo.py

diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py
new file mode 100644
index 00000000..ead563b9
--- /dev/null
+++ b/DynaBERT/eval_multiemo.py
@@ -0,0 +1,372 @@
+# coding=utf-8
+# 2020.08.28 - Changed regular evaluation to evaluation with adaptive width and depth
+#              Huawei Technologies Co., Ltd <houlu3@huawei.com>
+# Copyright (c) 2020, Huawei Technologies Co., Ltd.  All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import time
+from datetime import timedelta
+
+import numpy as np
+import torch
+from sklearn.metrics import classification_report
+from torch.utils.data import (DataLoader, SequentialSampler, TensorDataset)
+from tqdm import tqdm
+
+from transformers import (BertConfig,
+                          BertForSequenceClassification, BertTokenizer,
+                          RobertaConfig,
+                          RobertaForSequenceClassification,
+                          RobertaTokenizer,
+                          )
+
+from transformers.data.metrics import multiemo_compute_metrics as compute_metrics
+from transformers.data.processors.multiemo import multiemo_convert_examples_to_features as convert_examples_to_features, \
+    MultiemoProcessor, multiemo_output_modes
+from utils import dictionary_to_json
+
+logger = logging.getLogger(__name__)
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id, seq_length=None):
+        self.input_ids = input_ids  # token ids (zero-padded when built by convert_examples_to_features_test)
+        self.input_mask = input_mask  # that converter stores 1 per real token and 0 per padding slot
+        self.segment_ids = segment_ids  # that converter stores 0 for text_a, 1 for text_b, 0 for padding
+        self.seq_length = seq_length  # that converter stores the token count before padding; None if omitted
+        self.label_id = label_id  # label index (classification) or float target (regression)
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length.
+
+    One trailing token at a time is dropped from the longer list (from
+    ``tokens_b`` when both are equally long) until the combined length of the
+    two lists is at most ``max_length``. Nothing is returned.
+    """
+    while len(tokens_a) + len(tokens_b) > max_length:
+        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        longer.pop()
+
+
+def convert_examples_to_features_test(examples, label_list, max_seq_length,
+                                      tokenizer, output_mode):
+    """Converts `examples` into a list of `InputFeatures`, each padded to `max_seq_length`."""
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        tokens_a = tokenizer.tokenize(example.text_a)
+
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.tokenize(example.text_b)
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # room for [CLS], [SEP], [SEP]
+        else:
+            if len(tokens_a) > max_seq_length - 2:  # room for [CLS] and [SEP]
+                tokens_a = tokens_a[:(max_seq_length - 2)]
+
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
+
+        if tokens_b:
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        input_mask = [1] * len(input_ids)
+        seq_length = len(input_ids)  # length before padding
+
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+        try:
+            if output_mode == "classification":
+                label_id = label_map[example.label]
+            elif output_mode == "regression":
+                label_id = float(example.label)
+            else:
+                raise KeyError(output_mode)
+        except (KeyError, TypeError, ValueError):  # unknown/missing label or mode: fall back to id 0
+            label_id = 0
+
+        if ex_index < 1:
+            logger.info("*** Example ***")
+            logger.info("guid: %s" % (example.guid))
+            logger.info("tokens: %s" % " ".join(
+                [str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
+                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("label: {}".format(example.label))
+            logger.info("label_id: {}".format(label_id))
+
+        features.append(
+            InputFeatures(input_ids=input_ids,
+                          input_mask=input_mask,
+                          segment_ids=segment_ids,
+                          label_id=label_id,
+                          seq_length=seq_length))
+    return features
+
+
+MODEL_CLASSES = {  # --model_type value -> (config class, model class, tokenizer class)
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+}
+
+
+def set_seed(args):
+    seed = args.seed  # one seed shared by the Python, NumPy and torch generators
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if args.n_gpu > 0:  # additionally seed the CUDA generators
+        torch.cuda.manual_seed_all(seed)
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """Evaluate ``model`` on the test split of ``args.task_name``.
+
+    Metrics are logged and appended to ``test_results_<task>.txt`` inside the
+    ``<model_type>_<width_mult>_<depth_mult>_eval`` sub-directory of ``args.output_dir``.
+    Returns ``(results, preds, out_label_ids)``: the metrics dict, the logits (squeezed in
+    regression mode) and the labels fed to the model, the last two as NumPy arrays.
+    """
+    results = {}
+    # load_and_cache_examples_test returns (dataset, label_list); only the dataset is a
+    # valid DataLoader input, so the label list must be dropped here.
+    eval_dataset, _ = load_and_cache_examples_test(args, args.task_name, tokenizer)
+
+    eval_output_dir = os.path.join(
+        args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval')
+    os.makedirs(eval_output_dir, exist_ok=True)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    preds = None
+    out_label_ids = None
+    model.eval()
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            inputs = {'input_ids': batch[0],
+                      'attention_mask': batch[1],
+                      'labels': batch[3]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] \
+                    else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            logits = model(**inputs)[1]  # with labels supplied the output is (loss, logits, ...)
+
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = inputs['labels'].detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+
+    if args.output_mode == "regression":
+        preds = np.squeeze(preds)
+
+    result = compute_metrics(args.task_name, preds, out_label_ids)
+    results.update(result)
+    output_eval_file = os.path.join(eval_output_dir, "test_results_{0}.txt".format(args.task_name))
+
+    with open(output_eval_file, "a") as writer:
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+        writer.write("\n")
+
+    return results, preds, out_label_ids
+
+
+def get_tensor_data(output_mode, features):
+    """Pack `features` into a TensorDataset; returns `(dataset, label tensor)`."""
+    label_dtypes = {"classification": torch.long, "regression": torch.float}
+    if output_mode not in label_dtypes:  # used to surface later as an UnboundLocalError
+        raise ValueError("Unsupported output mode: %s" % output_mode)
+    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtypes[output_mode])
+
+    def as_long(attr):
+        return torch.tensor([getattr(f, attr) for f in features], dtype=torch.long)
+
+    tensor_data = TensorDataset(as_long('input_ids'), as_long('input_mask'), as_long('segment_ids'),
+                                all_label_ids, as_long('seq_length'))
+    return tensor_data, all_label_ids
+
+
+def load_and_cache_examples_test(args, task, tokenizer):
+    # Test-split loader: returns (TensorDataset, label_list). `task` is not read; args.task_name is.
+    _, lang, domain, kind = args.task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    output_mode = multiemo_output_modes['multiemo']
+    label_list = processor.get_labels()
+    test_examples = processor.get_test_examples(args.data_dir)
+    test_features = convert_examples_to_features_test(
+        test_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+    return get_tensor_data(output_mode, test_features)[0], label_list
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):  # `task` is not read below
+    _, lang, domain, kind = args.task_name.split('_')  # task_name must have exactly four '_' parts
+    processor = MultiemoProcessor(lang, domain, kind)
+    output_mode = multiemo_output_modes['multiemo']
+
+    logger.info("Creating features from dataset file at %s", args.data_dir)
+    label_list = processor.get_labels()
+    # evaluate=True selects the dev examples, otherwise the train examples; nothing is cached here
+    examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+
+    features = convert_examples_to_features(
+        examples,
+        tokenizer,
+        label_list=label_list,
+        max_length=args.max_seq_length,
+        output_mode=output_mode,
+        pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+    )
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+    # NOTE(review): any other output_mode leaves all_labels unbound and fails on the next statement
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
+    return dataset
+
+
+def main():
+    """Evaluate the model stored in <model_dir>/best on the test split and save a JSON report."""
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions will be written.")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_lower_case", default=True,
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=128, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    parser.add_argument("--model_dir", type=str, required=True,  # joined with 'best' below; None would crash
+                        help="Directory whose 'best' sub-directory holds the model to evaluate.")
+    parser.add_argument('--depth_mult', type=str, default='1.',
+                        help="the depth multiplier applied for evaluation, e.g., '1.' is for default")
+    parser.add_argument('--width_mult', type=str, default='1.',
+                        help="the width multiplier applied for evaluation, e.g., '1.' is for default")
+
+    args = parser.parse_args()
+    args.model_dir = os.path.join(args.model_dir, 'best')
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S',
+                        level=logging.INFO)
+    logger.warning("device: %s, n_gpu: %s", device, args.n_gpu, )
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare MULTIEMO task: provide num_labels here
+    args.task_name = args.task_name.lower()
+    if 'multiemo' not in args.task_name:
+        raise ValueError("Task not found: %s" % args.task_name)
+
+    _, lang, domain, kind = args.task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    args.output_mode = multiemo_output_modes['multiemo']
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+
+    config = config_class.from_pretrained(args.model_dir, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_dir, config=config)
+    model.to(args.device)
+    model.apply(lambda m: setattr(m, 'depth_mult', float(args.depth_mult)))
+    model.apply(lambda m: setattr(m, 'width_mult', float(args.width_mult)))
+
+    eval_start_time = time.monotonic()
+
+    results, y_logits, y_true = evaluate(args, model, tokenizer)
+    print(results)
+
+    eval_end_time = time.monotonic()
+
+    diff = timedelta(seconds=eval_end_time - eval_start_time)
+    diff_seconds = diff.total_seconds()
+
+    y_pred = np.argmax(y_logits, axis=1)
+    print('\n\t**** Classification report ****\n')
+    print(classification_report(y_true, y_pred, target_names=label_list))
+
+    report = classification_report(y_true, y_pred, target_names=label_list, output_dict=True)
+    report['eval_time'] = diff_seconds
+
+    eval_output_dir = os.path.join(
+        args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval')
+    dictionary_to_json(report, os.path.join(eval_output_dir, "test_results.json"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/DynaBERT/transformers/data/processors/multiemo.py b/DynaBERT/transformers/data/processors/multiemo.py
index be4a0eb2..e7cb3142 100644
--- a/DynaBERT/transformers/data/processors/multiemo.py
+++ b/DynaBERT/transformers/data/processors/multiemo.py
@@ -17,6 +17,8 @@
 
 import logging
 import os
+from typing import List
+
 import numpy as np
 from .utils import DataProcessor, InputExample, InputFeatures
 from ...file_utils import is_tf_available
@@ -27,15 +29,16 @@
 logger = logging.getLogger(__name__)
 
 
-def multiemo_convert_examples_to_features(examples, tokenizer,
-                                          max_length=512,
-                                          task=None,
-                                          label_list=None,
-                                          output_mode=None,
-                                          pad_on_left=False,
-                                          pad_token=0,
-                                          pad_token_segment_id=0,
-                                          mask_padding_with_zero=True):
+def multiemo_convert_examples_to_features(
+        examples, tokenizer,
+        max_length=512,
+        task=None,
+        label_list=None,
+        output_mode=None,
+        pad_on_left=False,
+        pad_token=0,
+        pad_token_segment_id=0,
+        mask_padding_with_zero=True):
     """
     Loads a data file into a list of ``InputFeatures``
 
@@ -61,13 +64,13 @@ def multiemo_convert_examples_to_features(examples, tokenizer,
     """
 
     if task is not None:
-        _, lang, domain, kind = task_name.split('_')
+        _, lang, domain, kind = task.split('_')
         processor = MultiemoProcessor(lang, domain, kind)
         if label_list is None:
             label_list = processor.get_labels()
             logger.info("Using label list %s for task %s" % (label_list, task))
         if output_mode is None:
-            output_mode = multiemo_output_modes_output_modes[task]
+            output_mode = multiemo_output_modes[task]
             logger.info("Using output mode %s for task %s" % (output_mode, task))
 
     label_map = {label: i for i, label in enumerate(label_list)}

From 0fc35b6354c236af984f9ae9e432466e0ee43ab9 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 17:03:48 +0100
Subject: [PATCH 44/62] feat: add script running experiments

---
 DynaBERT/run_experiments.py | 149 ++++++++++++++++++++++++++++++++++++
 DynaBERT/run_multiemo.py    |   2 -
 2 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 DynaBERT/run_experiments.py

diff --git a/DynaBERT/run_experiments.py b/DynaBERT/run_experiments.py
new file mode 100644
index 00000000..6f821b19
--- /dev/null
+++ b/DynaBERT/run_experiments.py
@@ -0,0 +1,149 @@
+import logging
+import os
+import sys
+
+PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
+DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%d/%m/%Y %H:%M:%S')
+logger = logging.getLogger(__name__)
+
+data_dir = os.path.join('data', 'multiemo2')
+
+batch_size = 16
+num_train_epochs = 3
+learning_rate = 5e-5
+weight_decay = 0.01
+
+
+def main():
+    """Run the multiemo_en_all_sentence pipeline; stages whose output folder exists are skipped."""
+    print(PROJECT_FOLDER)
+    os.chdir(PROJECT_FOLDER)
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'multiemo2')):
+        logger.info("Downloading Multiemo data")
+        cmd = 'python3 scripts/download_dataset.py --data_dir data/multiemo2'
+        run_process(cmd)
+        logger.info("Downloading finished")
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased')):
+        logger.info("Downloading bert-base-uncased model")
+        cmd = 'python3 download_bert_base.py'
+        run_process(cmd)
+        logger.info("Downloading finished")
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'bert-base-uncased', 'multiemo_en_all_sentence')):
+        cmd = 'python3 multiemo_fine_tune_bert.py '
+        options = [
+            '--pretrained_model', 'data/models/bert-base-uncased',
+            '--data_dir', 'data/multiemo2',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--output_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence',
+            '--learning_rate', str(learning_rate),
+            '--num_train_epochs', str(num_train_epochs),
+            '--weight_decay', str(weight_decay),
+            '--train_batch_size', str(batch_size),
+            '--do_lower_case'
+        ]
+        cmd += ' '.join(options)
+        logger.info("Training bert-base-uncased for multiemo_en_all_sentence")
+        run_process(cmd)
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabertw', 'multiemo_en_all_sentence')):
+        cmd = 'python3 run_multiemo.py '
+        options = [
+            '--model_type', 'bert',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--do_train',
+            '--data_dir', 'data/multiemo2',
+            '--model_dir', 'data/models/bert-base-uncased/multiemo_en_all_sentence',
+            '--output_dir', 'data/models/dynabertw/multiemo_en_all_sentence',
+            '--max_seq_length', str(128),
+            '--learning_rate', str(learning_rate),
+            '--num_train_epochs', str(num_train_epochs),
+            '--per_gpu_train_batch_size', str(batch_size),
+            '--weight_decay', str(weight_decay),
+            '--width_mult_list', '0.25,0.5,0.75,1.0',
+            '--width_lambda1', str(1.0),
+            '--width_lambda2', str(0.1),
+            '--training_phase', 'dynabertw'
+        ]
+        cmd += ' '.join(options)
+        logger.info("Training DynaBERT_W for multiemo_en_all_sentence")
+        run_process(cmd)
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabert', 'multiemo_en_all_sentence')):
+        cmd = 'python3 run_multiemo.py '
+        options = [
+            '--model_type', 'bert',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--do_train',
+            '--data_dir', 'data/multiemo2',
+            '--model_dir', 'data/models/dynabertw/multiemo_en_all_sentence',
+            '--output_dir', 'data/models/dynabert/multiemo_en_all_sentence',
+            '--max_seq_length', str(128),
+            '--learning_rate', str(learning_rate),
+            '--num_train_epochs', str(num_train_epochs),
+            '--per_gpu_train_batch_size', str(batch_size),
+            '--weight_decay', str(weight_decay),
+            '--width_mult_list', '0.25,0.5,0.75,1.0',
+            '--depth_mult_list', '0.5,0.75,1.0',
+            '--width_lambda1', str(1.0),
+            '--width_lambda2', str(1.0),
+            '--training_phase', 'dynabert',
+        ]
+        cmd += ' '.join(options)
+        logger.info("Training DynaBERT for multiemo_en_all_sentence")
+        run_process(cmd)
+
+    if not os.path.exists(os.path.join(DATA_FOLDER, 'models', 'dynabert-finetuned', 'multiemo_en_all_sentence')):
+        cmd = 'python3 run_multiemo.py '
+        options = [
+            '--model_type', 'bert',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--do_train',
+            '--data_dir', 'data/multiemo2',
+            # NOTE(review): starts from the dynabertw output, not data/models/dynabert - confirm intended.
+            '--model_dir', 'data/models/dynabertw/multiemo_en_all_sentence',
+            '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence',
+            '--max_seq_length', str(128),
+            '--learning_rate', str(learning_rate),
+            '--num_train_epochs', str(num_train_epochs),
+            '--per_gpu_train_batch_size', str(batch_size),
+            '--weight_decay', str(weight_decay),
+            '--width_mult_list', '0.25,0.5,0.75,1.0',
+            '--depth_mult_list', '0.5,0.75,1.0',
+            '--training_phase', 'final_finetuning',
+        ]
+        cmd += ' '.join(options)
+        logger.info("Finetuning DynaBERT for multiemo_en_all_sentence")
+        run_process(cmd)
+
+    cmd = 'python3 eval_multiemo.py '
+    options = [
+        '--model_type', 'bert',
+        '--task_name', 'multiemo_en_all_sentence',
+        '--data_dir', 'data/multiemo2',
+        '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence',
+        '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence',
+        '--max_seq_length', str(128),
+        '--depth_mult', '0.5'
+    ]
+    cmd += ' '.join(options)
+    logger.info("Evaluating DynaBERT for multiemo_en_all_sentence")
+    run_process(cmd)
+
+    # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence'
+    # logger.info(f"Gathering results to csv for multiemo_en_all_sentence")
+    # run_process(cmd)
+
+
+def run_process(proc):
+    os.system(proc)  # runs `proc` through the shell; the exit status is discarded
+
+
+if __name__ == '__main__':
+    main()
diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py
index 3eb3bd05..30cb4c28 100644
--- a/DynaBERT/run_multiemo.py
+++ b/DynaBERT/run_multiemo.py
@@ -467,8 +467,6 @@ def main():
                         help="Total number of training epochs to perform.")
     parser.add_argument("--warmup_steps", default=0, type=int,
                         help="Linear warmup over warmup_steps.")
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
     parser.add_argument("--hidden_dropout_prob", default=0.1, type=float,

From 045ababe671b3017a4e7d21ae66a81784426c481 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 19:06:37 +0100
Subject: [PATCH 45/62] feat: fix typing

---
 DynaBERT/transformers/data/processors/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DynaBERT/transformers/data/processors/utils.py b/DynaBERT/transformers/data/processors/utils.py
index 1556ef02..d1178364 100644
--- a/DynaBERT/transformers/data/processors/utils.py
+++ b/DynaBERT/transformers/data/processors/utils.py
@@ -18,6 +18,7 @@
 import sys
 import copy
 import json
+from typing import List
 
 
 class InputExample(object):

From b82f0d31b0e83f7e388648cf075a89369bf87f80 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 19:23:11 +0100
Subject: [PATCH 46/62] feat: add regex to requirements

---
 TernaryBERT/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TernaryBERT/requirements.txt b/TernaryBERT/requirements.txt
index 7dc69627..52e38093 100644
--- a/TernaryBERT/requirements.txt
+++ b/TernaryBERT/requirements.txt
@@ -8,4 +8,5 @@ numpy~=1.21.2
 pandas~=1.3.3
 scikit-learn~=1.0
 tqdm
+regex
 # torch==1.1.0

From 76c9611b3c5d409e94a20f949ba80b08cf13f326 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 19:47:23 +0100
Subject: [PATCH 47/62] feat: add minor corrects

---
 DynaBERT/run_multiemo.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py
index 30cb4c28..9d1459c0 100644
--- a/DynaBERT/run_multiemo.py
+++ b/DynaBERT/run_multiemo.py
@@ -189,7 +189,6 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
                     else:
                         loss = model(**inputs)[0]
 
-                    print(loss)
                     if args.n_gpu > 1:
                         loss = loss.mean()
                     if args.gradient_accumulation_steps > 1:
@@ -388,7 +387,7 @@ def compute_neuron_head_importance(args, model, tokenizer):
     eval_sampler = SequentialSampler(eval_dataset)
     eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+    for batch in tqdm(eval_dataloader, desc="Evaluating for determining importance"):
         batch = tuple(t.to(args.device) for t in batch)
         input_ids, input_mask, _, label_ids = batch
         segment_ids = batch[2] if args.model_type == 'bert' else None  # RoBERTa does't use segment_ids

From 3f7546bdf8915c9b5c241e5b03edf74699e3ff32 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 20:44:49 +0100
Subject: [PATCH 48/62] feat: add sklearn imports

---
 DynaBERT/transformers/data/metrics/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DynaBERT/transformers/data/metrics/__init__.py b/DynaBERT/transformers/data/metrics/__init__.py
index 00d321ad..8893d540 100644
--- a/DynaBERT/transformers/data/metrics/__init__.py
+++ b/DynaBERT/transformers/data/metrics/__init__.py
@@ -24,7 +24,8 @@
 
 try:
     from scipy.stats import pearsonr, spearmanr
-    from sklearn.metrics import matthews_corrcoef, f1_score
+    from sklearn.metrics import matthews_corrcoef, f1_score, accuracy_score
+
     _has_sklearn = True
 except (AttributeError, ImportError) as e:
     logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")

From 8f21d8e74a29166ecfd12b853fff2d8ec3cceef8 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Tue, 30 Nov 2021 22:49:11 +0100
Subject: [PATCH 49/62] feat: correct saving results

---
 DynaBERT/run_multiemo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DynaBERT/run_multiemo.py b/DynaBERT/run_multiemo.py
index 9d1459c0..bf5d7d08 100644
--- a/DynaBERT/run_multiemo.py
+++ b/DynaBERT/run_multiemo.py
@@ -241,8 +241,6 @@ def train(args, train_dataset, model, tokenizer, teacher_model=None):
                 current_best = sum(acc)
 
                 print("***best***{}\n".format(acc))
-                with open(output_eval_file, "a") as writer:
-                    writer.write("{}\n".format(acc))
 
                 logger.info("Saving model checkpoint to %s", args.output_dir)
                 model_to_save = model.module if hasattr(model, 'module') else model
@@ -567,6 +565,8 @@ def main():
         training_parameters['training_time'] = diff_seconds
 
         output_training_params_file = os.path.join(args.output_dir, "training_params.json")
+
+        training_parameters.pop('device')
         dictionary_to_json(training_parameters, output_training_params_file)
 
 

From ba9908ea8ac7d3b81fcf85063c1a4de450b56981 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 1 Dec 2021 09:08:18 +0100
Subject: [PATCH 50/62] feat: correct evaluation arguments

---
 DynaBERT/run_experiments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DynaBERT/run_experiments.py b/DynaBERT/run_experiments.py
index 6f821b19..3a1b448f 100644
--- a/DynaBERT/run_experiments.py
+++ b/DynaBERT/run_experiments.py
@@ -126,8 +126,8 @@ def main():
         '--model_type', 'bert',
         '--task_name', 'multiemo_en_all_sentence',
         '--data_dir', 'data/multiemo2',
-        '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence'
-        '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence'
+        '--model_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence',
+        '--output_dir', 'data/models/dynabert-finetuned/multiemo_en_all_sentence',
         '--max_seq_length', str(128),
         '--depth_mult', '0.5'
     ]

From d62c3038e7088e191f7b22aa530e4221cd83142a Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 1 Dec 2021 09:10:57 +0100
Subject: [PATCH 51/62] feat: correct model dir

---
 DynaBERT/eval_multiemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py
index ead563b9..f589e318 100644
--- a/DynaBERT/eval_multiemo.py
+++ b/DynaBERT/eval_multiemo.py
@@ -311,7 +311,7 @@ def main():
                         help="the possible depths used for training, e.g., '1.' is for default")
 
     args = parser.parse_args()
-    args.model_dir = os.path.join(args.model_dir, 'best')
+    # args.model_dir = os.path.join(args.model_dir, 'best')
     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
     args.n_gpu = torch.cuda.device_count()
     args.device = device

From 1938cb438b80243fba689d6d1f3d78b1ba7dac9a Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Wed, 1 Dec 2021 09:21:15 +0100
Subject: [PATCH 52/62] feat: correct loading test data

---
 DynaBERT/eval_multiemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DynaBERT/eval_multiemo.py b/DynaBERT/eval_multiemo.py
index f589e318..b78b9bc4 100644
--- a/DynaBERT/eval_multiemo.py
+++ b/DynaBERT/eval_multiemo.py
@@ -158,7 +158,7 @@ def set_seed(args):
 
 def evaluate(args, model, tokenizer, prefix=""):
     results = {}
-    eval_dataset = load_and_cache_examples_test(args, args.task_name, tokenizer)
+    eval_dataset, _ = load_and_cache_examples_test(args, args.task_name, tokenizer)
 
     eval_output_dir = os.path.join(
         args.output_dir, args.model_type + '_' + args.width_mult + '_' + args.depth_mult + '_eval')

From c112a595246334b060444a0811d21485d2ef0e1f Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 14:28:15 +0100
Subject: [PATCH 53/62] feat: add gathering results for ternarybert

---
 TernaryBERT/eval_quant_multiemo.py | 230 +++++++++++++++++++++++++++++
 TernaryBERT/gather_results.py      | 102 +++++++++++++
 TernaryBERT/quant_task_multiemo.py |  63 +++++---
 TernaryBERT/run_experiments.py     |  17 +++
 4 files changed, 391 insertions(+), 21 deletions(-)
 create mode 100644 TernaryBERT/eval_quant_multiemo.py
 create mode 100644 TernaryBERT/gather_results.py

diff --git a/TernaryBERT/eval_quant_multiemo.py b/TernaryBERT/eval_quant_multiemo.py
new file mode 100644
index 00000000..0312ecee
--- /dev/null
+++ b/TernaryBERT/eval_quant_multiemo.py
@@ -0,0 +1,230 @@
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import random
+import time
+from datetime import timedelta
+
+import torch
+from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
+from torch.nn import CrossEntropyLoss, MSELoss
+from sklearn.metrics import classification_report
+from tqdm import tqdm
+
+from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
+from transformer import BertTokenizer
+from transformer import BertConfig
+from utils_multiemo import *
+from utils import dictionary_to_json, result_to_text_file
+
+log_format = '%(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
+logger = logging.getLogger()
+
+
+def get_tensor_data(output_mode, features):
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    all_seq_lengths = torch.tensor([f.seq_length for f in features], dtype=torch.long)
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    tensor_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_seq_lengths)
+    return tensor_data, all_label_ids
+
+
+def do_eval(model, task_name, eval_dataloader,
+            device, output_mode, eval_labels, num_labels):
+    eval_loss = 0
+    nb_eval_steps = 0
+    all_logits = None
+
+    for batch_ in tqdm(eval_dataloader):
+        batch_ = tuple(t.to(device) for t in batch_)
+        with torch.no_grad():
+            input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_
+            logits, _, _ = model(input_ids, segment_ids, input_mask)
+
+        # create eval loss and other metric required by the task
+        if output_mode == "classification":
+            loss_fct = CrossEntropyLoss()
+            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+        elif output_mode == "regression":
+            loss_fct = MSELoss()
+            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+
+        eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+
+        if all_logits is None:
+            all_logits = logits.detach().cpu().numpy()
+        else:
+            all_logits = np.append(all_logits, logits.detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+
+    if output_mode == "regression":
+        all_logits = np.squeeze(all_logits)
+    result = compute_metrics(task_name, all_logits, eval_labels.numpy())
+    result['eval_loss'] = eval_loss
+    return result, all_logits
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir",
+                        default='data',
+                        type=str,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_dir",
+                        default='models/tinybert',
+                        type=str,
+                        help="The model dir.")
+    parser.add_argument("--task_name",
+                        type=str,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default='output',
+                        type=str,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--weight_bits",
+                        default=2,
+                        type=int,
+                        choices=[2, 8],
+                        help="Quantization bits for weight.")
+    parser.add_argument("--input_bits",
+                        default=8,
+                        type=int,
+                        help="Quantization bits for activation.")
+    parser.add_argument("--clip_val",
+                        default=2.5,
+                        type=float,
+                        help="Initial clip value.")
+
+    args = parser.parse_args()
+    task_name = args.task_name.lower()
+    data_dir = args.data_dir
+
+    model_dir = os.path.join(args.model_dir, task_name)
+    output_dir = os.path.join(args.output_dir, task_name)
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_modes = {
+        "multiemo": "classification"
+    }
+
+    default_params = {
+        "multiemo": {"max_seq_length": 128, "batch_size": 16}
+    }
+
+    # Prepare devices
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+
+    # Prepare seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if task_name in default_params:
+        args.batch_size = default_params[task_name]["batch_size"]
+        if n_gpu > 0:
+            args.batch_size = int(args.batch_size * n_gpu)
+        args.max_seq_length = default_params[task_name]["max_seq_length"]
+    elif 'multiemo' in task_name:
+        args.batch_size = default_params['multiemo']["batch_size"]
+        if n_gpu > 0:
+            args.batch_size = int(args.batch_size * n_gpu)
+        args.max_seq_length = default_params['multiemo']["max_seq_length"]
+
+    if 'multiemo' in task_name:
+        _, lang, domain, kind = task_name.split('_')
+        processor = MultiemoProcessor(lang, domain, kind)
+    else:
+        raise ValueError("Task not found: %s" % task_name)
+
+    if 'multiemo' in task_name:
+        output_mode = output_modes['multiemo']
+    else:
+        raise ValueError("Task not found: %s" % task_name)
+
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=args.do_lower_case)
+
+    #########################
+    #       Test model      #
+    #########################
+    test_examples = processor.get_test_examples(data_dir)
+    test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer,
+                                                 output_mode)
+
+    test_data, test_labels = get_tensor_data(output_mode, test_features)
+    test_sampler = SequentialSampler(test_data)
+    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
+
+    config = BertConfig.from_pretrained(
+        model_dir,
+        quantize_act=True,
+        weight_bits=args.weight_bits,
+        input_bits=args.input_bits,
+        clip_val=args.clip_val
+    )
+    model = QuantBertForSequenceClassification.from_pretrained(model_dir, config=config, num_labels=num_labels)
+    model.to(device)
+
+    model_quant_dir = os.path.join(model_dir, 'quant')
+    qunat_config = BertConfig.from_pretrained(
+        model_quant_dir,
+        quantize_act=True,
+        weight_bits=args.weight_bits,
+        input_bits=args.input_bits,
+        clip_val=args.clip_val
+    )
+    quant_model = QuantBertForSequenceClassification.from_pretrained(model_quant_dir, config=qunat_config,
+                                                                     num_labels=num_labels)
+    quant_model.to(device)
+
+    output_quant_dir = os.path.join(output_dir, 'quant')
+    for m, out_dir in zip([model, quant_model], [output_dir, output_quant_dir]):
+        logger.info("\n***** Running evaluation on test dataset *****")
+        logger.info("  Num examples = %d", len(test_features))
+        logger.info("  Batch size = %d", args.batch_size)
+
+        eval_start_time = time.monotonic()
+        m.eval()
+        result, y_logits = do_eval(m, task_name, test_dataloader,
+                                   device, output_mode, test_labels, num_labels)
+        eval_end_time = time.monotonic()
+
+        diff = timedelta(seconds=eval_end_time - eval_start_time)
+        diff_seconds = diff.total_seconds()
+        result['eval_time'] = diff_seconds
+        result_to_text_file(result, os.path.join(out_dir, "test_results.txt"))
+
+        y_pred = np.argmax(y_logits, axis=1)
+        print('\n\t**** Classification report ****\n')
+        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
+
+        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
+        report['eval_time'] = diff_seconds
+        dictionary_to_json(report, os.path.join(out_dir, "test_results.json"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py
new file mode 100644
index 00000000..dd925b15
--- /dev/null
+++ b/TernaryBERT/gather_results.py
@@ -0,0 +1,102 @@
+import argparse
+import json
+import os
+from typing import Any, Dict
+
+import pandas as pd
+
+from transformer import BertConfig
+from transformer.modeling_quant import BertForSequenceClassification as QuantBertForSequenceClassification
+from utils_multiemo import MultiemoProcessor
+
+PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
+DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
+MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'bert-of-theseus')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+
+    args = parser.parse_args()
+    task_name = args.task_name
+
+    models_subdirectories = get_immediate_subdirectories(MODELS_FOLDER)
+    print(MODELS_FOLDER)
+
+    print(models_subdirectories)
+    data = list()
+    for subdirectory in models_subdirectories:
+        data_dict = gather_results(subdirectory, task_name)
+        data.append(data_dict)
+
+    df = pd.DataFrame(data)
+    cols = df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    df = df[cols]
+    df.to_csv(os.path.join(DATA_FOLDER, 'results-ternarybert-' + task_name + '.csv'), index=False)
+
+
+def get_immediate_subdirectories(a_dir):
+    return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
+            if os.path.isdir(os.path.join(a_dir, name))]
+
+
+def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]:
+    quant_model_dir = os.path.join(model_dir, 'quant')
+
+    with open(os.path.join(quant_model_dir, 'training_params.json')) as json_file:
+        training_data_dict = json.load(json_file)
+
+    with open(os.path.join(quant_model_dir, 'test_results.json')) as json_file:
+        test_data = json.load(json_file)
+        [test_data_dict] = pd.json_normalize(test_data, sep='_').to_dict(orient='records')
+
+    data = training_data_dict.copy()  # start with keys and values of x
+    data.update(test_data_dict)
+
+    model_size = os.path.getsize(os.path.join(quant_model_dir, 'pytorch_model.bin'))
+    data['model_size'] = model_size
+
+    if 'multiemo' not in task_name:
+        raise ValueError("Task not found: %s" % task_name)
+
+    _, lang, domain, kind = task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # LOADING THE BEST MODEL
+    student_config = BertConfig.from_pretrained(
+        quant_model_dir,
+        quantize_act=True,
+        weight_bits=data['weight_bits'],
+        input_bits=data['input_bits'],
+        clip_val=data['clip_val']
+    )
+    model = QuantBertForSequenceClassification.from_pretrained(quant_model_dir, config=student_config,
+                                                               num_labels=num_labels)
+
+    memory_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
+    memory_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+    memory_used = memory_params + memory_buffers  # in bytes
+
+    data['memory'] = memory_used
+
+    parameters_num = 0
+    for n, p in model.named_parameters():
+        parameters_num += p.nelement()
+
+    data['parameters'] = parameters_num
+    data['name'] = os.path.basename(model_dir)
+    print(data)
+
+    return data
+
+
+if __name__ == '__main__':
+    main()
diff --git a/TernaryBERT/quant_task_multiemo.py b/TernaryBERT/quant_task_multiemo.py
index 4a0c43b7..1726b13f 100644
--- a/TernaryBERT/quant_task_multiemo.py
+++ b/TernaryBERT/quant_task_multiemo.py
@@ -435,28 +435,49 @@ def main():
     test_sampler = SequentialSampler(test_data)
     test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.batch_size)
 
-    logger.info("\n***** Running evaluation on test dataset *****")
-    logger.info("  Num examples = %d", len(test_features))
-    logger.info("  Batch size = %d", args.batch_size)
-
-    eval_start_time = time.monotonic()
-    student_model.eval()
-    result, y_logits = do_eval(student_model, task_name, test_dataloader,
-                               device, output_mode, test_labels, num_labels)
-    eval_end_time = time.monotonic()
-
-    diff = timedelta(seconds=eval_end_time - eval_start_time)
-    diff_seconds = diff.total_seconds()
-    result['eval_time'] = diff_seconds
-    result_to_text_file(result, os.path.join(output_dir, "test_results.txt"))
-
-    y_pred = np.argmax(y_logits, axis=1)
-    print('\n\t**** Classification report ****\n')
-    print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
+    config = BertConfig.from_pretrained(
+        output_dir,
+        quantize_act=True,
+        weight_bits=args.weight_bits,
+        input_bits=args.input_bits,
+        clip_val=args.clip_val
+    )
+    model = QuantBertForSequenceClassification.from_pretrained(output_dir, config=config, num_labels=num_labels)
 
-    report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
-    report['eval_time'] = diff_seconds
-    dictionary_to_json(report, os.path.join(output_dir, "test_results.json"))
+    output_quant_dir = os.path.join(output_dir, 'quant')
+    qunat_config = BertConfig.from_pretrained(
+        output_quant_dir,
+        quantize_act=True,
+        weight_bits=args.weight_bits,
+        input_bits=args.input_bits,
+        clip_val=args.clip_val
+    )
+    quant_model = QuantBertForSequenceClassification.from_pretrained(output_quant_dir, config=qunat_config,
+                                                                     num_labels=num_labels)
+
+    for m, out_dir in zip([model, quant_model], [output_dir, output_quant_dir]):
+        logger.info("\n***** Running evaluation on test dataset *****")
+        logger.info("  Num examples = %d", len(test_features))
+        logger.info("  Batch size = %d", args.batch_size)
+
+        eval_start_time = time.monotonic()
+        m.eval()
+        result, y_logits = do_eval(m, task_name, test_dataloader,
+                                   device, output_mode, test_labels, num_labels)
+        eval_end_time = time.monotonic()
+
+        diff = timedelta(seconds=eval_end_time - eval_start_time)
+        diff_seconds = diff.total_seconds()
+        result['eval_time'] = diff_seconds
+        result_to_text_file(result, os.path.join(out_dir, "test_results.txt"))
+
+        y_pred = np.argmax(y_logits, axis=1)
+        print('\n\t**** Classification report ****\n')
+        print(classification_report(test_labels.numpy(), y_pred, target_names=label_list))
+
+        report = classification_report(test_labels.numpy(), y_pred, target_names=label_list, output_dict=True)
+        report['eval_time'] = diff_seconds
+        dictionary_to_json(report, os.path.join(out_dir, "test_results.json"))
 
 
 def get_optimizer(args, num_train_optimization_steps, student_model):
diff --git a/TernaryBERT/run_experiments.py b/TernaryBERT/run_experiments.py
index 0c0882f4..e669a3bb 100644
--- a/TernaryBERT/run_experiments.py
+++ b/TernaryBERT/run_experiments.py
@@ -17,6 +17,8 @@
 learning_rate = 5e-5
 weight_decay = 0.01
 
+evaluate = False
+
 
 def main():
     print(PROJECT_FOLDER)
@@ -72,6 +74,21 @@ def main():
     logger.info(f"Training ternarybert for multiemo_en_all_sentence")
     run_process(cmd)
 
+    if evaluate:
+        cmd = 'python3 eval_quant_multiemo.py '
+        options = [
+            '--data_dir', 'data/multiemo2',
+            '--model_dir ', 'data/models/ternarybert',
+            '--task_name', 'multiemo_en_all_sentence',
+            '--output_dir', 'data/models/ternarybert'
+            '--weight_bits', str(2),
+            '--input_bits', str(8),
+            '--do_lower_case'
+        ]
+        cmd += ' '.join(options)
+        logger.info(f"Evaluating ternarybert for multiemo_en_all_sentence")
+        run_process(cmd)
+
     # cmd = f'python3 -m gather_results --task_name multiemo_en_all_sentence'
     # logger.info(f"Gathering results to csv for multiemo_en_all_sentence")
     # run_process(cmd)

From 7b592794805e606bb224e9a3aeebd33fa2079b3a Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 14:49:35 +0100
Subject: [PATCH 54/62] feat: correct dir in gathering result script

---
 TernaryBERT/gather_results.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py
index dd925b15..9f4d96c6 100644
--- a/TernaryBERT/gather_results.py
+++ b/TernaryBERT/gather_results.py
@@ -11,7 +11,7 @@
 
 PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
 DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
-MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'bert-of-theseus')
+MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models', 'ternarybert')
 
 
 def main():
@@ -49,7 +49,7 @@ def get_immediate_subdirectories(a_dir):
 def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]:
     quant_model_dir = os.path.join(model_dir, 'quant')
 
-    with open(os.path.join(quant_model_dir, 'training_params.json')) as json_file:
+    with open(os.path.join(model_dir, 'training_params.json')) as json_file:
         training_data_dict = json.load(json_file)
 
     with open(os.path.join(quant_model_dir, 'test_results.json')) as json_file:

From 0c82cc6c0a441f1ff52d1a18227c508c01bde6be Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 14:50:15 +0100
Subject: [PATCH 55/62] add results

---
 .../data/results-ternarybert-multiemo_en_all_sentence.csv       | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv

diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
new file mode 100644
index 00000000..0ed4a317
--- /dev/null
+++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
@@ -0,0 +1,2 @@
+name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters
+multiemo_en_all_sentence,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316

From 4fec476b657fb8455574bb8c6817b261d4b03d18 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 15:16:01 +0100
Subject: [PATCH 56/62] feat: add gathering results for DynaBERT

---
 DynaBERT/gather_results.py | 112 +++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 DynaBERT/gather_results.py

diff --git a/DynaBERT/gather_results.py b/DynaBERT/gather_results.py
new file mode 100644
index 00000000..3f1a7ad4
--- /dev/null
+++ b/DynaBERT/gather_results.py
@@ -0,0 +1,112 @@
+import argparse
+import json
+import os
+from typing import Any, Dict
+
+import pandas as pd
+
+from transformers import BertConfig, BertForSequenceClassification
+
+from transformers.data.processors.multiemo import MultiemoProcessor
+
+PROJECT_FOLDER = os.path.dirname(os.path.abspath(__file__))
+DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
+MODELS_FOLDER = os.path.join(DATA_FOLDER, 'models')
+DYNABERT_FT_MODELS_FOLDER = os.path.join(MODELS_FOLDER, 'dynabert-finetuned')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument('--depth_mult', type=str, default='1.',
+                        help="the possible depths used for training, e.g., '1.' is for default")
+    parser.add_argument('--width_mult', type=str, default='1.',
+                        help="the possible depths used for training, e.g., '1.' is for default")
+
+    args = parser.parse_args()
+    task_name = args.task_name
+
+    models_subdirectories = get_immediate_subdirectories(DYNABERT_FT_MODELS_FOLDER)
+    print(DYNABERT_FT_MODELS_FOLDER)
+    print(models_subdirectories)
+
+    data = list()
+    for subdirectory in models_subdirectories:
+        data_dict = gather_results(subdirectory, task_name, args.depth_mult, args.width_mult)
+        data.append(data_dict)
+
+    df = pd.DataFrame(data)
+    cols = df.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    df = df[cols]
+    df.to_csv(os.path.join(DATA_FOLDER, 'results-dynabert-' + task_name + '.csv'), index=False)
+
+
+def get_immediate_subdirectories(a_dir):
+    return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
+            if os.path.isdir(os.path.join(a_dir, name))]
+
+
+def gather_results(model_dir: str, task_name: str, depth_mult: float, width_mult: float) -> Dict[str, Any]:
+    task_subfolder = os.path.basename(model_dir)
+    eval_output_dir = os.path.join(
+        model_dir, 'bert_' + str(width_mult) + '_' + str(depth_mult) + '_eval')
+
+    with open(os.path.join(model_dir, 'training_params.json')) as json_file:
+        training_data_dict = json.load(json_file)
+
+    with open(os.path.join(eval_output_dir, 'test_results.json')) as json_file:
+        test_data = json.load(json_file)
+        [test_data_dict] = pd.json_normalize(test_data, sep='_').to_dict(orient='records')
+
+    data = training_data_dict.copy()  # start with keys and values of x
+    data.update(test_data_dict)
+
+    with open(os.path.join(MODELS_FOLDER, 'dynabertw', task_subfolder, 'training_params.json')) as json_file:
+        dynabertw_training_data_dict = json.load(json_file)
+        data['training_time'] = data['training_time'] + dynabertw_training_data_dict['training_time']
+
+    with open(os.path.join(MODELS_FOLDER, 'dynabert', task_subfolder, 'training_params.json')) as json_file:
+        dynabert_training_data_dict = json.load(json_file)
+        data['training_time'] = data['training_time'] + dynabert_training_data_dict['training_time']
+
+    model_size = os.path.getsize(os.path.join(model_dir, 'pytorch_model.bin'))
+    data['model_size'] = model_size
+
+    if 'multiemo' not in task_name:
+        raise ValueError("Task not found: %s" % task_name)
+
+    _, lang, domain, kind = task_name.split('_')
+    processor = MultiemoProcessor(lang, domain, kind)
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # LOADING THE BEST MODEL
+    config = BertConfig.from_pretrained(model_dir, num_labels=num_labels, finetuning_task=task_name)
+    model = BertForSequenceClassification.from_pretrained(model_dir, config=config)
+    model.apply(lambda m: setattr(m, 'depth_mult', float(depth_mult)))
+    model.apply(lambda m: setattr(m, 'width_mult', float(width_mult)))
+
+    memory_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
+    memory_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+    memory_used = memory_params + memory_buffers  # in bytes
+
+    data['memory'] = memory_used
+
+    parameters_num = 0
+    for n, p in model.named_parameters():
+        parameters_num += p.nelement()
+
+    data['parameters'] = parameters_num
+    data['name'] = os.path.basename(model_dir)
+    data['model_name'] = 'dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult)
+    print(data)
+    return data
+
+
+if __name__ == '__main__':
+    main()

From d8290b59e63b95b4f32a475abf0d4afc742ac3b1 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 15:21:15 +0100
Subject: [PATCH 57/62] Add DynaBERT results

---
 DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv

diff --git a/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv
new file mode 100644
index 00000000..da4102b1
--- /dev/null
+++ b/DynaBERT/data/results-dynabert-multiemo_en_all_sentence.csv
@@ -0,0 +1,2 @@
+name,model_name,data_dir,model_dir,output_dir,model_type,task_name,max_seq_length,do_train,evaluate_during_training,do_lower_case,per_gpu_train_batch_size,per_gpu_eval_batch_size,gradient_accumulation_steps,learning_rate,weight_decay,num_train_epochs,warmup_steps,seed,hidden_dropout_prob,attention_probs_dropout_prob,data_aug,depth_mult_list,depth_lambda1,depth_lambda2,width_mult_list,width_lambda1,width_lambda2,training_phase,n_gpu,output_mode,train_batch_size,eval_batch_size,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters
+multiemo_en_all_sentence,dynabert_d_0.5_w_1.,data/multiemo2,data/models/dynabertw/multiemo_en_all_sentence,data/models/dynabert-finetuned/multiemo_en_all_sentence,bert,multiemo_en_all_sentence,128,True,True,True,16,8,1,5e-05,0.01,3.0,0,42,0.1,0.1,False,"[0.5, 0.75, 1.0]",1.0,1.0,"[0.25, 0.5, 0.75, 1.0]",1.0,0.1,final_finetuning,1,classification,16,8,34809.954543,0.7724978241949522,14.607117,0.5470219435736677,0.5124816446402349,0.5291887793783169,681,0.7953004970628107,0.8290155440414507,0.8118081180811808,2123,0.8104786545924968,0.8232588699080158,0.8168187744458931,1522,0.798219584569733,0.7582804792107117,0.7777376219732562,1419,0.737755169949677,0.7307591344501033,0.7338883234696617,5745,0.770612184792384,0.7724978241949522,0.771219156436846,5745,438020911,437941264,109485316

From f8df472777d9c5de328d61828d210baecc2f9658 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 15:59:21 +0100
Subject: [PATCH 58/62] feat: add model name to results

---
 TernaryBERT/gather_results.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py
index 9f4d96c6..68ca41bc 100644
--- a/TernaryBERT/gather_results.py
+++ b/TernaryBERT/gather_results.py
@@ -93,6 +93,7 @@ def gather_results(model_dir: str, task_name: str) -> Dict[str, Any]:
 
     data['parameters'] = parameters_num
     data['name'] = os.path.basename(model_dir)
+    data['model_name'] = 'TernaryBERT'
     print(data)
 
     return data

From a983e54bcf7be80fcb365f548a72e706733630b7 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 15:59:35 +0100
Subject: [PATCH 59/62] Update results

---
 .../data/results-ternarybert-multiemo_en_all_sentence.csv     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
index 0ed4a317..cc0a090d 100644
--- a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
+++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
@@ -1,2 +1,2 @@
-name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters
-multiemo_en_all_sentence,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316
+model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters,name
+TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316,multiemo_en_all_sentence

From 71cd4a5ac513ea6a57c5398eff1aecf031689398 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 16:03:12 +0100
Subject: [PATCH 60/62] feat: minor correction

---
 TernaryBERT/gather_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TernaryBERT/gather_results.py b/TernaryBERT/gather_results.py
index 68ca41bc..7953a52b 100644
--- a/TernaryBERT/gather_results.py
+++ b/TernaryBERT/gather_results.py
@@ -36,7 +36,7 @@ def main():
 
     df = pd.DataFrame(data)
     cols = df.columns.tolist()
-    cols = cols[-1:] + cols[:-1]
+    cols = cols[-2:] + cols[:-2]
     df = df[cols]
     df.to_csv(os.path.join(DATA_FOLDER, 'results-ternarybert-' + task_name + '.csv'), index=False)
 

From 8173a5c39f8d22075ca6e38fc82c02bd96556bc8 Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 16:02:09 +0100
Subject: [PATCH 61/62] Update results 2

---
 .../data/results-ternarybert-multiemo_en_all_sentence.csv     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
index cc0a090d..c0be3ad7 100644
--- a/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
+++ b/TernaryBERT/data/results-ternarybert-multiemo_en_all_sentence.csv
@@ -1,2 +1,2 @@
-model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters,name
-TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316,multiemo_en_all_sentence
+name,model_name,data_dir,model_dir,teacher_model,student_model,task_name,output_dir,do_lower_case,learning_rate,num_train_epochs,weight_decay,seed,aug_train,pred_distill,intermediate_distill,save_fp_model,save_quantized_model,weight_bits,input_bits,clip_val,batch_size,max_seq_length,training_time,accuracy,eval_time,z_amb_precision,z_amb_recall,z_amb_f1-score,z_amb_support,z_minus_m_precision,z_minus_m_recall,z_minus_m_f1-score,z_minus_m_support,z_plus_m_precision,z_plus_m_recall,z_plus_m_f1-score,z_plus_m_support,z_zero_precision,z_zero_recall,z_zero_f1-score,z_zero_support,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,model_size,memory,parameters
+multiemo_en_all_sentence,TernaryBERT,data/multiemo2,data/models/bert-base-uncased,data/models/bert-base-uncased/multiemo_en_all_sentence,data/models/bert-base-uncased/multiemo_en_all_sentence,multiemo_en_all_sentence,data/models/ternarybert,True,5e-05,3.0,0.01,42,False,True,True,True,True,2,8,2.5,16,128,5409.666556,0.785378590078329,73.592085,0.6198347107438017,0.44052863436123346,0.5150214592274678,681,0.7760372565622354,0.8634008478568064,0.817391304347826,2123,0.8234536082474226,0.8396846254927727,0.8314899154196487,1522,0.8173719376391982,0.7758985200845666,0.7960954446854664,1419,0.7591743782981645,0.7298781569488448,0.7399995309201022,5745,0.7802927249714108,0.785378590078329,0.7800240975007349,5745,438087017,437942824,109485316

From 7a977d6719226ac30fa4e3732baed037ed156c7c Mon Sep 17 00:00:00 2001
From: wojtek11530 <wojtek19962a32@gmail.com>
Date: Thu, 9 Dec 2021 16:31:48 +0100
Subject: [PATCH 62/62] feat: capitalize model name

---
 DynaBERT/gather_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DynaBERT/gather_results.py b/DynaBERT/gather_results.py
index 3f1a7ad4..85544525 100644
--- a/DynaBERT/gather_results.py
+++ b/DynaBERT/gather_results.py
@@ -103,7 +103,7 @@ def gather_results(model_dir: str, task_name: str, depth_mult: float, width_mult
 
     data['parameters'] = parameters_num
     data['name'] = os.path.basename(model_dir)
-    data['model_name'] = 'dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult)
+    data['model_name'] = 'Dynabert_d_' + str(depth_mult) + '_w_' + str(width_mult)
     print(data)
     return data