From a3e5e03683e35177c7ffd7390e777a49f9740a7a Mon Sep 17 00:00:00 2001 From: danaiksez Date: Thu, 14 Nov 2019 14:47:08 +0200 Subject: [PATCH 1/5] baseline rnn --- examples/therapy.py | 185 ++++++++++++++++ examples/therapy_title.py | 185 ++++++++++++++++ slp/data/collators_title.py | 75 +++++++ slp/data/therapy.py | 145 +++++++++++++ slp/data/therapy_title.py | 144 ++++++++++++ slp/modules/hier_att_net.py | 125 +++++++++++ slp/modules/hier_att_net_title.py | 136 ++++++++++++ slp/trainer/trainer.py | 11 +- slp/trainer/trainer_title.py | 350 ++++++++++++++++++++++++++++++ 9 files changed, 1355 insertions(+), 1 deletion(-) create mode 100644 examples/therapy.py create mode 100644 examples/therapy_title.py create mode 100644 slp/data/collators_title.py create mode 100644 slp/data/therapy.py create mode 100644 slp/data/therapy_title.py create mode 100644 slp/modules/hier_att_net.py create mode 100644 slp/modules/hier_att_net_title.py create mode 100644 slp/trainer/trainer_title.py diff --git a/examples/therapy.py b/examples/therapy.py new file mode 100644 index 0000000..8ceabf3 --- /dev/null +++ b/examples/therapy.py @@ -0,0 +1,185 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 
+ +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.001) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of 
sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) +# to_tensor = ToTensor(device='cpu') + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../data/psychotherapy/', + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + +# train_loader, val_loader = train_test_split(bio, batch_train, batch_val, test_size=.2) + + + #model = HierAttNet( + # hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + #model = model.to(DEVICE) + #criterion = nn.CrossEntropyLoss() + #optimizer = Adam(model.parameters(), lr=0.001) + + #metrics = { + # 'accuracy': Accuracy(), + # 'loss': Loss(criterion) + #} + + + #trainer = SequentialTrainer( +# model, +# optimizer, +# checkpoint_dir='../checkpoints' if not DEBUG else None, +# metrics=metrics, +# non_blocking=True, +# patience=10, +# loss_fn=criterion, +# device=DEVICE) + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + 
final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title.py b/examples/therapy_title.py new file mode 100644 index 0000000..33f782d --- /dev/null +++ b/examples/therapy_title.py @@ -0,0 +1,185 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators_title import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net_title import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + 
batch_size=batch_val, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = 
loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) +# to_tensor = ToTensor(device='cpu') + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../data/psychotherapy/', + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + +# train_loader, val_loader = train_test_split(bio, batch_train, batch_val, test_size=.2) + + + #model = HierAttNet( + # hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + #model = model.to(DEVICE) + #criterion = nn.CrossEntropyLoss() + #optimizer = Adam(model.parameters(), lr=0.001) + + #metrics = { + # 'accuracy': Accuracy(), + # 'loss': Loss(criterion) + #} + + + #trainer = SequentialTrainer( +# model, +# optimizer, +# checkpoint_dir='../checkpoints' if not DEBUG else None, +# metrics=metrics, +# non_blocking=True, +# patience=10, +# loss_fn=criterion, +# device=DEVICE) + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + 
print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/slp/data/collators_title.py b/slp/data/collators_title.py new file mode 100644 index 0000000..296cac8 --- /dev/null +++ b/slp/data/collators_title.py @@ -0,0 +1,75 @@ +import torch +from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence + +from slp.modules.util import pad_mask, subsequent_mask +from slp.util import mktensor + + +class SequenceClassificationCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): + inputs, titles, targets = map(list, zip(*batch)) + lengths = torch.tensor([len(s) for s in inputs], device=self.device) + # Pad and convert to tensor + inputs = (pad_sequence(inputs, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + lengths_title = torch.tensor([len(t) for t in titles], device=self.device) + titles = (pad_sequence(titles, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + + targets = mktensor(targets, device=self.device, dtype=torch.long) + return inputs, titles, targets.to(self.device), lengths, lengths_title + + +class TransformerCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def pad_and_mask(self, tensors): + lengths = torch.tensor([len(s) for s in tensors], + device=self.device) + max_length = torch.max(lengths) + pad_m = pad_mask(lengths, max_length=max_length, device=self.device) + sub_m = subsequent_mask(max_length) + tensors = (pad_sequence(tensors, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + return tensors, pad_m, sub_m + + @staticmethod + def get_inputs_and_targets(batch): + inputs, targets = map(list, zip(*batch)) + return inputs, targets + + def 
__call__(self, batch): + inputs, targets = self.get_inputs_and_targets(batch) + inputs, pad_m_inputs, _ = self.pad_and_mask(inputs) + targets, pad_m_targets, sub_m = self.pad_and_mask(targets) + mask_targets = pad_m_targets.unsqueeze(-2) * sub_m + mask_inputs = pad_m_inputs.unsqueeze(-2) + return inputs, targets, mask_inputs, mask_targets + + +class PackedSequenceCollator(object): + def __init__(self, pad_indx=0, device='cpu', batch_first=True): + self.seq_collator = SequenceClassificationCollator( + pad_indx=pad_indx, device=device) + self.batch_first = batch_first + self.device = device + + def __call__(self, batch): + inputs, targets, lengths = self.seq_collator(batch) + inputs = pack_padded_sequence( + inputs, lengths, + batch_first=self.batch_first, + enforce_sorted=False) + return inputs, targets.to(self.device), lengths[inputs.sorted_indices] diff --git a/slp/data/therapy.py b/slp/data/therapy.py new file mode 100644 index 0000000..16ba880 --- /dev/null +++ b/slp/data/therapy.py @@ -0,0 +1,145 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset + + +def pad_sequence(sequences, batch_first=False, padding_len=None, padding_value=0): + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] +# import pdb; pdb.set_trace() + max_size = sequences[0].size() + + trailing_dims = max_size[1:] + if padding_len is not None: + max_len = padding_len + else: + max_len = max([s.size(0) for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) + for i, tensor in enumerate(sequences): + if tensor.size(0) > padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to 
prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self.file = pd.read_csv(csv_file) + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. 
subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] + _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + + metadata = self.metadata[idx] +# import pdb; pdb.set_trace() + if self.text_transforms is not None: + lista = [] + turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + + p1 = [x for x in p if x!=''] + p2 = [(x+ ' '+ y) for x,y in zip(p1[0::2], p1[1::2])] + + for i in p2: + i = i.split(":") + if len(i) is not 1: + turns.append(i[0]) + lista.append(self.text_transforms(i[1])) + +# padding_len = len(max(lista, key=len)) +# preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=padding_len) + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + + + lab = int("Depression (emotion)" in label) + return (preprocessed_text, lab) + diff --git a/slp/data/therapy_title.py b/slp/data/therapy_title.py new file mode 100644 index 0000000..caae0ee --- /dev/null +++ b/slp/data/therapy_title.py @@ -0,0 +1,144 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset + + +def pad_sequence(sequences, batch_first=False, padding_len=None, 
padding_value=0): + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] +# import pdb; pdb.set_trace() + max_size = sequences[0].size() + + trailing_dims = max_size[1:] + if padding_len is not None: + max_len = padding_len + else: + max_len = max([s.size(0) for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) + for i, tensor in enumerate(sequences): + if tensor.size(0) > padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self.file = pd.read_csv(csv_file) + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 
'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] + _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + if self.text_transforms is not None: + lista = [] + turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + + p1 = [x for x in p if x!=''] + p2 = [(x+ ' '+ y) for x,y in zip(p1[0::2], p1[1::2])] + + for i in p2: + i = i.split(":") + if len(i) is not 1: + turns.append(i[0]) + lista.append(self.text_transforms(i[1])) + +# padding_len = len(max(lista, key=len)) +# preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=padding_len) + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_title = self.text_transforms(title) + + lab = int("Depression (emotion)" in label) + return (preprocessed_text, preprocessed_title, lab) + diff --git a/slp/modules/hier_att_net.py b/slp/modules/hier_att_net.py new file mode 100644 index 0000000..9020e6d --- /dev/null +++ b/slp/modules/hier_att_net.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from 
slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): + output = self.lookup(inputs) + + output, lengths = self.pack(output,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self,hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): + f_output, lengths = self.pack(inputs, lengths) + 
f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths): + # inputs = (B, S, W) + output_list_text = [] + text = inputs.permute(1, 0, 2) + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 + + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + 
return output + + diff --git a/slp/modules/hier_att_net_title.py b/slp/modules/hier_att_net_title.py new file mode 100644 index 0000000..dec0220 --- /dev/null +++ b/slp/modules/hier_att_net_title.py @@ -0,0 +1,136 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): + output = self.lookup(inputs) + + output, lengths = self.pack(output,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self,hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = 
nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, titles, hidden_state): + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + + titles = torch.unsqueeze(titles, dim=1) + f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + output_list_text = [] + text = inputs.permute(1, 0, 2) + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 + + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, 
self.word_hidden_state) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, self.word_hidden_state) #[8,600] + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + + + + # output_list_text = (S, B, 600) + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, output_title, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/trainer/trainer.py b/slp/trainer/trainer.py index d3d517b..48bddfa 100644 --- a/slp/trainer/trainer.py +++ b/slp/trainer/trainer.py @@ -154,8 +154,9 @@ def train_step(self: TrainerType, engine: Engine, batch: List[torch.Tensor]) -> float: self.model.train() + #import pdb; pdb.set_trace() y_pred, targets = self.get_predictions_and_targets(batch) - loss = self.loss_fn(y_pred, targets) # type: ignore + loss = self.loss_fn(y_pred, targets.long()) # type: ignore if self.parallel: loss = loss.mean() loss = loss / self.accumulation_steps @@ -197,6 +198,13 @@ def fit(self: TrainerType, validation=True) self.model.zero_grad() self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + return best_score + + + def overfit_single_batch(self: TrainerType, train_loader: DataLoader) -> State: @@ -286,6 +294,7 @@ def get_predictions_and_targets( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: inputs, targets, lengths = self.parse_batch(batch) + #import pdb; pdb.set_trace() y_pred = self.model(inputs, lengths) return y_pred, targets diff --git a/slp/trainer/trainer_title.py b/slp/trainer/trainer_title.py new file mode 100644 index 0000000..09e7d2a --- /dev/null +++ b/slp/trainer/trainer_title.py 
@@ -0,0 +1,350 @@ +import os +from typing import Union +import torch +import torch.nn as nn + +from ignite.handlers import EarlyStopping +from ignite.contrib.handlers import ProgressBar +from ignite.engine import Engine, Events, State +from ignite.metrics import RunningAverage, Loss + +from torch.optim.optimizer import Optimizer +from torch.nn.modules.loss import _Loss +from torch.utils.data import DataLoader + +from typing import cast, List, Optional, Tuple, TypeVar +from slp.util import types +from slp.util.parallel import DataParallelModel, DataParallelCriterion + +from slp.trainer.handlers import CheckpointHandler, EvaluationHandler +from slp.util import from_checkpoint, to_device +from slp.util import log +from slp.util import system + + +TrainerType = TypeVar('TrainerType', bound='Trainer') + + +class Trainer(object): + def __init__(self: TrainerType, + model: nn.Module, + optimizer: Optimizer, + checkpoint_dir: str = '../../checkpoints', + experiment_name: str = 'experiment', + model_checkpoint: Optional[str] = None, + optimizer_checkpoint: Optional[str] = None, + metrics: types.GenericDict = None, + patience: int = 10, + validate_every: int = 1, + accumulation_steps: int = 1, + loss_fn: Union[_Loss, DataParallelCriterion] = None, + non_blocking: bool = True, + retain_graph: bool = False, + dtype: torch.dtype = torch.float, + device: str = 'cpu', + parallel: bool = False) -> None: + self.dtype = dtype + self.retain_graph = retain_graph + self.non_blocking = non_blocking + self.device = device + self.loss_fn = loss_fn + self.validate_every = validate_every + self.patience = patience + self.accumulation_steps = accumulation_steps + self.checkpoint_dir = checkpoint_dir + + model_checkpoint = self._check_checkpoint(model_checkpoint) + optimizer_checkpoint = self._check_checkpoint(optimizer_checkpoint) + + self.model = cast(nn.Module, from_checkpoint( + model_checkpoint, model, map_location=torch.device('cpu'))) + self.model = self.model.type(dtype).to(device) + 
self.optimizer = from_checkpoint(optimizer_checkpoint, optimizer) + self.parallel = parallel + if parallel: + if device == 'cpu': + raise ValueError("parallel can be used only with cuda device") + self.model = DataParallelModel(self.model).to(device) + self.loss_fn = DataParallelCriterion(self.loss_fn) # type: ignore + if metrics is None: + metrics = {} + if 'loss' not in metrics: + if self.parallel: + metrics['loss'] = Loss( + lambda x, y: self.loss_fn(x, y).mean()) # type: ignore + else: + metrics['loss'] = Loss(self.loss_fn) + self.trainer = Engine(self.train_step) + self.train_evaluator = Engine(self.eval_step) + self.valid_evaluator = Engine(self.eval_step) + for name, metric in metrics.items(): + metric.attach(self.train_evaluator, name) + metric.attach(self.valid_evaluator, name) + + self.pbar = ProgressBar() + self.val_pbar = ProgressBar(desc='Validation') + + if checkpoint_dir is not None: + self.checkpoint = CheckpointHandler( + checkpoint_dir, experiment_name, score_name='validation_loss', + score_function=self._score_fn, n_saved=2, + require_empty=False, save_as_state_dict=True) + + self.early_stop = EarlyStopping( + patience, self._score_fn, self.trainer) + + self.val_handler = EvaluationHandler(pbar=self.pbar, + validate_every=1, + early_stopping=self.early_stop) + self.attach() + log.info( + f'Trainer configured to run {experiment_name}\n' + f'\tpretrained model: {model_checkpoint} {optimizer_checkpoint}\n' + f'\tcheckpoint directory: {checkpoint_dir}\n' + f'\tpatience: {patience}\n' + f'\taccumulation steps: {accumulation_steps}\n' + f'\tnon blocking: {non_blocking}\n' + f'\tretain graph: {retain_graph}\n' + f'\tdevice: {device}\n' + f'\tmodel dtype: {dtype}\n' + f'\tparallel: {parallel}') + + def _check_checkpoint(self: TrainerType, + ckpt: Optional[str]) -> Optional[str]: + if ckpt is None: + return ckpt + if system.is_url(ckpt): + ckpt = system.download_url(cast(str, ckpt), self.checkpoint_dir) + ckpt = os.path.join(self.checkpoint_dir, ckpt) + 
return ckpt + + @staticmethod + def _score_fn(engine: Engine) -> float: + """Returns the scoring metric for checkpointing and early stopping + + Args: + engine (ignite.engine.Engine): The engine that calculates + the val loss + + Returns: + (float): The validation loss + """ + negloss: float = -engine.state.metrics['loss'] + return negloss + + def parse_batch( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets + + def get_predictions_and_targets( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets = self.parse_batch(batch) + y_pred = self.model(inputs) + return y_pred, targets + + def train_step(self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> float: + self.model.train() + #import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) + loss = self.loss_fn(y_pred, targets.long()) # type: ignore + if self.parallel: + loss = loss.mean() + loss = loss / self.accumulation_steps + loss.backward(retain_graph=self.retain_graph) + if (self.trainer.state.iteration + 1) % self.accumulation_steps == 0: + self.optimizer.step() # type: ignore + self.optimizer.zero_grad() + loss_value: float = loss.item() + return loss_value + + def eval_step( + self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + self.model.eval() + with torch.no_grad(): + y_pred, targets = self.get_predictions_and_targets(batch) + return y_pred, targets + + def predict(self: TrainerType, dataloader: DataLoader) -> State: + return self.valid_evaluator.run(dataloader) + + def fit(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader, + epochs: int = 50) -> State: + log.info( + 'Trainer will run for\n' + f'model: {self.model}\n' + 
f'optimizer: {self.optimizer}\n' + f'loss: {self.loss_fn}') + self.val_handler.attach(self.trainer, + self.train_evaluator, + train_loader, + validation=False) + self.val_handler.attach(self.trainer, + self.valid_evaluator, + val_loader, + validation=True) + self.model.zero_grad() + self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + return best_score + + + + + def overfit_single_batch(self: TrainerType, + train_loader: DataLoader) -> State: + single_batch = [next(iter(train_loader))] + + if self.trainer.has_event_handler(self.val_handler, Events.EPOCH_COMPLETED): + self.trainer.remove_event_handler(self.val_handler, Events.EPOCH_COMPLETED) + + self.val_handler.attach(self.trainer, + self.train_evaluator, + single_batch, # type: ignore + validation=False) + out = self.trainer.run(single_batch, max_epochs=100) + return out + + def fit_debug(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader) -> State: + train_loader = iter(train_loader) + train_subset = [next(train_loader), next(train_loader)] + val_loader = iter(val_loader) # type: ignore + val_subset = [next(val_loader), next(val_loader)] # type ignore + out = self.fit(train_subset, val_subset, epochs=6) # type: ignore + return out + + def _attach_checkpoint(self: TrainerType) -> TrainerType: + ckpt = { + 'model': self.model, + 'optimizer': self.optimizer + } + if self.checkpoint_dir is not None: + self.valid_evaluator.add_event_handler( + Events.COMPLETED, self.checkpoint, ckpt) + return self + + + def attach(self: TrainerType) -> TrainerType: + ra = RunningAverage(output_transform=lambda x: x) + ra.attach(self.trainer, "Train Loss") + self.pbar.attach(self.trainer, ['Train Loss']) + self.val_pbar.attach(self.train_evaluator) + self.val_pbar.attach(self.valid_evaluator) + self.valid_evaluator.add_event_handler(Events.COMPLETED, + self.early_stop) + self = 
self._attach_checkpoint() + def graceful_exit(engine, e): + if isinstance(e, KeyboardInterrupt): + engine.terminate() + log.warn("CTRL-C caught. Exiting gracefully...") + else: + raise(e) + + self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit) + self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + return self + + +class AutoencoderTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs + + +class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + + return inputs, titles, targets, lengths, title_lengths + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, lengths, title_lengths = self.parse_batch(batch) + y_pred = self.model(inputs, lengths, titles, title_lengths) + return y_pred, targets + + +class Seq2seqTrainer(SequentialTrainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs, lengths + + +class TransformerTrainer(Trainer): + def parse_batch( + self, + batch: 
List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + mask_inputs = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + mask_targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, mask_inputs, mask_targets + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets, mask_inputs, mask_targets = self.parse_batch(batch) + y_pred = self.model(inputs, + targets, + source_mask=mask_inputs, + target_mask=mask_targets) + targets = targets.view(-1) + y_pred = y_pred.view(targets.size(0), -1) + # TODO: BEAMSEARCH!! + return y_pred, targets From bfbedc46d2e7868a4d567bd7af0c2709039e18a2 Mon Sep 17 00:00:00 2001 From: danaiksez Date: Mon, 20 Jan 2020 15:07:25 +0200 Subject: [PATCH 2/5] lexicons --- examples/therapy_title.py | 38 +- examples/therapy_title_attentional_embed.py | 164 ++++++++ .../therapy_title_attentional_embed_on2.py | 160 ++++++++ slp/data/collators.py | 50 ++- slp/data/collators_title.py | 17 +- slp/data/therapy.py | 22 +- slp/data/therapy_title.py | 58 ++- slp/data/therapy_title_on2.py | 170 +++++++++ slp/data/transforms.py | 6 + slp/load_lexicons/get_BL_features.py | 69 ++++ slp/load_lexicons/get_afinn_features.py | 55 +++ slp/load_lexicons/get_all_6lexicons.py | 75 ++++ slp/load_lexicons/get_all_lexicons.py | 75 ++++ slp/load_lexicons/get_liwc_features.py | 99 +++++ slp/load_lexicons/get_mpqa_features.py | 63 +++ slp/load_lexicons/get_nrc_emolex_features.py | 27 ++ .../get_semeval2015_twitter_features.py | 18 + slp/modules/helpers.py | 6 +- slp/modules/hier_att_net_title.py | 32 +- .../hier_att_net_title_attentional_embed.py | 172 +++++++++ .../hier_att_net_title_attentional_embed2.py | 149 ++++++++ slp/trainer/trainer.py | 79 +++- 
slp/trainer/trainer_title.py | 1 + slp/trainer/trainer_title_no_validation.py | 359 ++++++++++++++++++ 24 files changed, 1879 insertions(+), 85 deletions(-) create mode 100644 examples/therapy_title_attentional_embed.py create mode 100644 examples/therapy_title_attentional_embed_on2.py create mode 100644 slp/data/therapy_title_on2.py create mode 100755 slp/load_lexicons/get_BL_features.py create mode 100755 slp/load_lexicons/get_afinn_features.py create mode 100644 slp/load_lexicons/get_all_6lexicons.py create mode 100644 slp/load_lexicons/get_all_lexicons.py create mode 100755 slp/load_lexicons/get_liwc_features.py create mode 100755 slp/load_lexicons/get_mpqa_features.py create mode 100755 slp/load_lexicons/get_nrc_emolex_features.py create mode 100755 slp/load_lexicons/get_semeval2015_twitter_features.py create mode 100644 slp/modules/hier_att_net_title_attentional_embed.py create mode 100644 slp/modules/hier_att_net_title_attentional_embed2.py create mode 100644 slp/trainer/trainer_title_no_validation.py diff --git a/examples/therapy_title.py b/examples/therapy_title.py index 33f782d..d9963ea 100644 --- a/examples/therapy_title.py +++ b/examples/therapy_title.py @@ -65,9 +65,9 @@ def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): for train_indices, val_indices in kfold.split(dataset): yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) -def trainer_factory(embeddings, device=DEVICE): +def trainer_factory(embeddings, idx2word, device=DEVICE): model = HierAttNet( - hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word) model = model.to(DEVICE) criterion = nn.CrossEntropyLoss() optimizer = Adam(model.parameters(), lr=0.0005) @@ -100,7 +100,7 @@ def trainer_factory(embeddings, device=DEVICE): max_word_length = 150 #max length of each sentence (turn) - after padding 
num_classes = 2 batch_size = 8 - hidden_size = 300 + hidden_size = 380 epochs = 40 @@ -112,11 +112,11 @@ def trainer_factory(embeddings, device=DEVICE): tokenizer = SpacyTokenizer() replace_unknowns = ReplaceUnknownToken() to_token_ids = ToTokenIds(word2idx) -# to_tensor = ToTensor(device='cpu') to_tensor = ToTensor(device=DEVICE) bio = PsychologicalDataset( - '../data/balanced_new_csv.csv', '../data/psychotherapy/', + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, text_transforms = Compose([ tokenizer, replace_unknowns, @@ -124,37 +124,13 @@ def trainer_factory(embeddings, device=DEVICE): to_tensor])) -# train_loader, val_loader = train_test_split(bio, batch_train, batch_val, test_size=.2) - - - #model = HierAttNet( - # hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) - #model = model.to(DEVICE) - #criterion = nn.CrossEntropyLoss() - #optimizer = Adam(model.parameters(), lr=0.001) - - #metrics = { - # 'accuracy': Accuracy(), - # 'loss': Loss(criterion) - #} - - - #trainer = SequentialTrainer( -# model, -# optimizer, -# checkpoint_dir='../checkpoints' if not DEBUG else None, -# metrics=metrics, -# non_blocking=True, -# patience=10, -# loss_fn=criterion, -# device=DEVICE) if KFOLD: cv_scores = [] import gc for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): - trainer = trainer_factory(embeddings, device=DEVICE) + trainer = trainer_factory(embeddings, idx2word, device=DEVICE) fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) cv_scores.append(fold_score) print("**********************") @@ -165,7 +141,7 @@ def trainer_factory(embeddings, device=DEVICE): final_score = float(sum(cv_scores)) / len(cv_scores) else: train_loader, val_loader = train_test_split(bio, batch_train, batch_val) - trainer = trainer_factory(embeddings, device=DEVICE) + trainer = trainer_factory(embeddings, idx2word, device=DEVICE) final_score = trainer.fit(train_loader, 
val_loader, epochs=MAX_EPOCHS) print(f'Final score: {final_score}') diff --git a/examples/therapy_title_attentional_embed.py b/examples/therapy_title_attentional_embed.py new file mode 100644 index 0000000..4049e6e --- /dev/null +++ b/examples/therapy_title_attentional_embed.py @@ -0,0 +1,164 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators_title import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net_title_attentional_embed import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +DEVICE = 'cpu' +#DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device='cpu') + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + num_workers=0, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + num_workers=0, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not 
None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, idx2word, lex_size, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word, lex_size) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 100 + batch_val = 100 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 100 + hidden_size = 300 + lex_size = 99 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device='cpu') + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + 
max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title_attentional_embed_on2.py b/examples/therapy_title_attentional_embed_on2.py new file mode 100644 index 0000000..0c6d3ca --- /dev/null +++ b/examples/therapy_title_attentional_embed_on2.py @@ -0,0 +1,160 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators_title import SequenceClassificationCollator +from slp.data.therapy_title_on2 import PsychologicalDataset, TupleDataset +from 
slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net_title_attentional_embed import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, idx2word, lex_size, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word, lex_size) + model = model.to(DEVICE) + criterion = 
nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + lex_size = 99 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, 
epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/slp/data/collators.py b/slp/data/collators.py index 222fa82..e38ae25 100644 --- a/slp/data/collators.py +++ b/slp/data/collators.py @@ -3,6 +3,7 @@ from slp.modules.util import pad_mask, subsequent_mask from slp.util import mktensor +from slp.data.transforms import ToTensor class SequenceClassificationCollator(object): @@ -11,15 +12,60 @@ def __init__(self, pad_indx=0, device='cpu'): self.device = device def __call__(self, batch): - inputs, targets = map(list, zip(*batch)) + inputs, titles, targets = map(list, zip(*batch)) lengths = torch.tensor([len(s) for s in inputs], device=self.device) + lengths_title = torch.tensor([len(t) for t in titles], device=self.device) + + # Pad and convert to tensor + inputs = (pad_sequence(inputs, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + titles = (pad_sequence(titles, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + + targets = mktensor(targets, device=self.device, dtype=torch.long) + return inputs, titles, targets.to(self.device), lengths, lengths_title + + +class BertCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): +# import pdb; pdb.set_trace() + inputs, targets = map(list, zip(*batch)) # Pad and convert to tensor inputs = (pad_sequence(inputs, batch_first=True, padding_value=self.pad_indx) .to(self.device)) targets = mktensor(targets, device=self.device, 
dtype=torch.long) - return inputs, targets.to(self.device), lengths + + attention_masks = [] + segments = [] + for seq in inputs: + seq_mask = [float(i>0) for i in seq] + attention_masks.append(seq_mask) + segm = [0] * len(seq) + segments.append(segm) + + masks = mktensor(attention_masks, device=self.device, dtype=torch.long) + segments = mktensor(segments, device=self.device, dtype=torch.long) + + return inputs, targets.to(self.device), masks.to(self.device), segments.to(self.device) + + + + + + + + + class TransformerCollator(object): diff --git a/slp/data/collators_title.py b/slp/data/collators_title.py index 296cac8..2178290 100644 --- a/slp/data/collators_title.py +++ b/slp/data/collators_title.py @@ -1,9 +1,9 @@ import torch -from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence +from torch.nn.utils.rnn import pack_padded_sequence , pad_sequence from slp.modules.util import pad_mask, subsequent_mask from slp.util import mktensor - +from slp.data.therapy_title import pad_sequence as pad_sequence1 class SequenceClassificationCollator(object): def __init__(self, pad_indx=0, device='cpu'): @@ -11,13 +11,22 @@ def __init__(self, pad_indx=0, device='cpu'): self.device = device def __call__(self, batch): + #import pdb; pdb.set_trace() + inputs, titles, targets = map(list, zip(*batch)) - lengths = torch.tensor([len(s) for s in inputs], device=self.device) + number_of_sentences = torch.tensor([len(s) for s in inputs], device=self.device) + length_of_sentences = ([torch.tensor([len(s) for s in inp]) for inp in inputs]) + + inputs = [pad_sequence1(i, padding_len=150, batch_first=True, padding_value=0) for i in inputs] + + # Pad and convert to tensor inputs = (pad_sequence(inputs, batch_first=True, padding_value=self.pad_indx) .to(self.device)) + length_of_sentences = pad_sequence1(length_of_sentences, padding_len=inputs.shape[1], batch_first=True, padding_value=1) + lengths_title = torch.tensor([len(t) for t in titles], device=self.device) titles = 
(pad_sequence(titles, batch_first=True, @@ -25,7 +34,7 @@ def __call__(self, batch): .to(self.device)) targets = mktensor(targets, device=self.device, dtype=torch.long) - return inputs, titles, targets.to(self.device), lengths, lengths_title + return inputs, titles, targets.to(self.device), number_of_sentences, length_of_sentences, lengths_title class TransformerCollator(object): diff --git a/slp/data/therapy.py b/slp/data/therapy.py index 16ba880..baac80a 100644 --- a/slp/data/therapy.py +++ b/slp/data/therapy.py @@ -10,9 +10,7 @@ def pad_sequence(sequences, batch_first=False, padding_len=None, padding_value=0): # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] -# import pdb; pdb.set_trace() max_size = sequences[0].size() - trailing_dims = max_size[1:] if padding_len is not None: max_len = padding_len @@ -117,29 +115,27 @@ def __getitem__(self, idx): preprocessed_text = self.transcript[idx] label = self.label[idx].split("; ") title = self.title[idx] - metadata = self.metadata[idx] -# import pdb; pdb.set_trace() + + + if self.text_transforms is not None: lista = [] turns = [] p = strip_tags(preprocessed_text) p = p.split("\n") - p1 = [x for x in p if x!=''] - p2 = [(x+ ' '+ y) for x,y in zip(p1[0::2], p1[1::2])] + p = [x for x in p if x!=''] - for i in p2: + for i in p: i = i.split(":") - if len(i) is not 1: + if len(i) == 2: turns.append(i[0]) lista.append(self.text_transforms(i[1])) - -# padding_len = len(max(lista, key=len)) -# preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=padding_len) + #import pdb; pdb.set_trace() preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_title = self.text_transforms(title) lab = int("Depression (emotion)" in label) - return (preprocessed_text, lab) - + return (preprocessed_text, preprocessed_title, lab) diff --git a/slp/data/therapy_title.py b/slp/data/therapy_title.py index 
caae0ee..1d4a39b 100644 --- a/slp/data/therapy_title.py +++ b/slp/data/therapy_title.py @@ -73,6 +73,8 @@ def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): self.max_word_len = max_word_len self.text_transforms = text_transforms self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + self.patient_turns = ['CLIENT','PT','PATIENT','CL','Client','Danny','Juan', + 'PARTICIPANT','CG', 'RESPONDENT','F','Angie','Jeff', 'Bill'] def get_files_labels_metadata(self, root_dir, _file): @@ -124,21 +126,53 @@ def __getitem__(self, idx): turns = [] p = strip_tags(preprocessed_text) p = p.split("\n") - p1 = [x for x in p if x!=''] - p2 = [(x+ ' '+ y) for x,y in zip(p1[0::2], p1[1::2])] - - for i in p2: - i = i.split(":") - if len(i) is not 1: - turns.append(i[0]) - lista.append(self.text_transforms(i[1])) -# padding_len = len(max(lista, key=len)) -# preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=padding_len) - preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) - preprocessed_title = self.text_transforms(title) +# p2 = [(x+ ' '+ y) for x,y in zip(p1[0::2], p1[1::2])] #wrong + for i in p1: + i = i.split(":") + if len(i) != 1 and not '' in i: + if any(s in i[0] for s in self.patient_turns): + turns.append(i[0]) + lista.append(self.text_transforms(i[1])) + + + +#right for on2 for (i, j) in zip(p1[::2], p1[1::2]): +# i = i.split(":") +# j = j.split(":") +# if len(i)!= 1 and len(j)!= 1: +# turns.append(i[0]) +# turns.append(j[0]) +# d = i[1] + ' ' + j[1] +# lista.append(self.text_transforms(d)) + +# if len(lista) == 0: +# for (i, j) in zip(p1[::2], p1[1::2]): +# i = i.split(":") +# j = j.split(":") +# if len(i)!= 1: +# turns.append(i[0]) +# isum = i[1] +# else: +# isum = '' +# +# if len(j)!= 1: +# turns.append(j[0]) +# jsum = j[1] +# else: +# jsum = '' +# +# d = isum + ' ' + jsum +# lista.append(self.text_transforms(d)) +# if len(lista) == 
0: +# import pdb; pdb.set_trace() +# + +# preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_text = lista + preprocessed_title = self.text_transforms(title) lab = int("Depression (emotion)" in label) return (preprocessed_text, preprocessed_title, lab) diff --git a/slp/data/therapy_title_on2.py b/slp/data/therapy_title_on2.py new file mode 100644 index 0000000..23f170f --- /dev/null +++ b/slp/data/therapy_title_on2.py @@ -0,0 +1,170 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset + + +def pad_sequence(sequences, batch_first=False, padding_len=None, padding_value=0): + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] +# import pdb; pdb.set_trace() + max_size = sequences[0].size() + + trailing_dims = max_size[1:] + if padding_len is not None: + max_len = padding_len + else: + max_len = max([s.size(0) for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) + for i, tensor in enumerate(sequences): + if tensor.size(0) > padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] 
= tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self.file = pd.read_csv(csv_file) + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + self.patient_turns = ['CLIENT','PT','PATIENT','CL','Client','Danny','Juan', + 'PARTICIPANT','CG', 'RESPONDENT','F','Angie','Jeff', 'Bill'] + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. 
subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] + _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + if self.text_transforms is not None: + lista = [] + turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + p1 = [x for x in p if x!=''] + + + for (i, j) in zip(p1[::2], p1[1::2]): + i = i.split(":") + j = j.split(":") + if len(i)!= 1 and len(j)!= 1: + turns.append(i[0]) + turns.append(j[0]) + d = i[1] + ' ' + j[1] + lista.append(self.text_transforms(d)) + + if len(lista) == 0: +# import pdb; pdb.set_trace() + for (i, j) in zip(p1[::2], p1[1::2]): + i = i.split(":") + j = j.split(":") + if len(i)!= 1: + turns.append(i[0]) + isum = i[1] + else: + isum = '' + + if len(j)!= 1: + turns.append(j[0]) + jsum = j[1] + else: + jsum = '' + + d = isum + ' ' + jsum + lista.append(self.text_transforms(d)) +# if len(lista) == 0: +# import pdb; pdb.set_trace() + + +# import pdb; pdb.set_trace() + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_title = self.text_transforms(title) + + lab = int("Depression (emotion)" in label) + return (preprocessed_text, preprocessed_title, lab) + diff --git a/slp/data/transforms.py 
b/slp/data/transforms.py index 8ae2f22..3f9a7da 100644 --- a/slp/data/transforms.py +++ b/slp/data/transforms.py @@ -1,5 +1,6 @@ import spacy import torch +import re import sentencepiece as spm from transformers import BertTokenizer @@ -9,6 +10,11 @@ from slp.util import mktensor +def remove_punctuation(txt): + ch = "[.?:_'!,)(]" + txt = re.sub(ch, '', txt) + return txt + class SentencepieceTokenizer(object): def __init__( self, diff --git a/slp/load_lexicons/get_BL_features.py b/slp/load_lexicons/get_BL_features.py new file mode 100755 index 0000000..95552b0 --- /dev/null +++ b/slp/load_lexicons/get_BL_features.py @@ -0,0 +1,69 @@ +import os + +BASE_DIR = '../data/' + + +# Opinion Lexicon (or Sentiment Lexicon) - Bing Liu (~6.800 entries) +# -------------------------------------- +# format = dictionary with entries like this: +# word1={'positive': 1, 'negative': 0} +# word2={'positive': 0, 'negative': 1} + +def load_bingliu_lexicon(neg_file, pos_file): + + # returns Bing Liu Opinion lexicon in the form of a dictionary + # keys: words, values: "positive" or "negative" + + _data = {} + + # negative words + lines = open(neg_file, "r", encoding="utf-8").readlines() + lines = lines[35:] + + total_neg_words = len(lines) + + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + _feature = "negative" + _data[_word] = _feature + + + # positive words + lines = open(pos_file, "r", encoding="utf-8").readlines() + lines = lines[35:] + + total_pos_words = len(lines) + cnt = 0 + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + + if _word in _data.keys(): + cnt += 1 + _feature = "positive" + _data[_word] = _feature + + return _data, cnt, total_neg_words, total_pos_words + +#################################################### +# Load Bing Liu Opinion Lexicon +#################################################### + +# get the Bing Liu Opinion Lexicon in the form of a dictionary +# where keys are the 
unique words +# and values a scalar + +def bing_liu(): + # BL_LEX_PATH = os.path.join(BASE_DIR, 'lexicons_kate', 'Bing_Liu_opinion_lex') + BL_LEX_PATH = BASE_DIR + lexicon, both_pos_neg, neg_words, pos_words = load_bingliu_lexicon(neg_file=os.path.join(BL_LEX_PATH, 'negative-words.txt'), + pos_file=os.path.join(BL_LEX_PATH, 'positive-words.txt')) + lex = {} + for word in lexicon: + if lexicon[word] == 'negative': + lex[word] = [-1.] + elif lexicon[word] == 'positive': + lex[word] = [1.] + return lex + diff --git a/slp/load_lexicons/get_afinn_features.py b/slp/load_lexicons/get_afinn_features.py new file mode 100755 index 0000000..b88ac71 --- /dev/null +++ b/slp/load_lexicons/get_afinn_features.py @@ -0,0 +1,55 @@ +# AFINN is a list of English words rated for valence with an integer +# between minus five (negative) and plus five (positive). The words have +# been manually labeled by Finn Ã…rup Nielsen in 2009-2011. The file +# is tab-separated. Total words: 2477. + +import os + +BASE_DIR = '../data/' + + +def load_afinn_lexicon(): + + # returns AFINN lexicon in the form of a dictionary + # keys: words, values: valence score (integer -5 to +5) + +# file = os.path.join(BASE_DIR, 'lexicons_kate', 'AFINN', 'AFINN-111.txt') + file = os.path.join(BASE_DIR, 'AFINN-111.txt') + + _data = {} + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + _feature = _row[1] + _data[_word] = _feature + return _data + +def load_features(file): + + print("edw") + dim2num = {} # [dimension name]: corresponding number in lexicon list + num2dim = {} # the exact opposite + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _dim = _row[1] + dim2num[_dim] = line_id + num2dim[line_id] = _dim + return dim2num, num2dim + +#################################################### +# Load AFINN Lexicon 
+#################################################### + +# get the AFINN lexicon in the form of a dictionary +# where keys are the unique words +# and values a scalar +# +# total_words = len(lex) + + + + diff --git a/slp/load_lexicons/get_all_6lexicons.py b/slp/load_lexicons/get_all_6lexicons.py new file mode 100644 index 0000000..8681c78 --- /dev/null +++ b/slp/load_lexicons/get_all_6lexicons.py @@ -0,0 +1,75 @@ +import os +import torch + +import torch.nn as nn + +from get_afinn_features import load_afinn_lexicon +from get_BL_features import * +from get_liwc_features import load_liwc_lex , load_features +from get_mpqa_features import * +from get_semeval2015_twitter_features import * +from get_nrc_emolex_features import * +from slp.data.therapy_title import pad_sequence +from slp.data.transforms import ToTensor + +BASE_DIR = '../data/' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +#DEVICE = 'cpu' + +class LexiconFeatures(nn.Module): + def __init__(self): + super(LexiconFeatures, self).__init__() + + self.afinn = load_afinn_lexicon() + self.BL = bing_liu() + self.liwc = load_liwc_lex() + self.liwc_classes, _ = load_features(os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + self.mpqa = mpqa_lex() + self.semeval = semeval15_lexicon() + self.emolex = emolex() + + def forward(self, inputs, idx2word, padding_len): + to_tensor = ToTensor(device=DEVICE) + final_vector = [] +# import pdb; pdb.set_trace() + for inputt in inputs: + vec = [] + for inp in inputt: + word = idx2word[inp.item()] + vector = [] + if word in self.afinn: + vector.append(float(self.afinn[word])) + else: + vector.append(float(0)) + if word in self.semeval: + vector.append(self.semeval[word]) + else: + vector.append(float(0)) + if word in self.BL: + try: + vector.append(float(self.BL[word][0])) + except: + import pdb; pdb.set_trace() + else: + vector.append(float(0)) + if word in self.mpqa: + vector.extend(self.mpqa[word]) + else: + vector.extend([float(0)]*4) + if word in 
self.liwc: + v = [float(i) for i in self.liwc[word]] + vector.extend(v) + else: + vector.extend([float(0)]*73) + if word in self.emolex: + v = [float(i) for i in self.emolex[word]] + vector.extend(v) + else: + vector.extend([float(0)]*19) + vec.append(vector) + try: + final_vector.append(to_tensor(vec)) + except: + import pdb; pdb.set_trace() + final_vector = pad_sequence(final_vector, padding_len=padding_len, batch_first=True) + return final_vector diff --git a/slp/load_lexicons/get_all_lexicons.py b/slp/load_lexicons/get_all_lexicons.py new file mode 100644 index 0000000..dad58c0 --- /dev/null +++ b/slp/load_lexicons/get_all_lexicons.py @@ -0,0 +1,75 @@ +import os +import torch + +import torch.nn as nn + +from get_afinn_features import load_afinn_lexicon +from get_BL_features import * +from get_liwc_features import load_liwc_lex , load_features +from get_mpqa_features import * +from get_semeval2015_twitter_features import * +from get_nrc_emolex_features import * +from slp.data.therapy_title import pad_sequence +from slp.data.transforms import ToTensor + +BASE_DIR = '../data/' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +#def LexiconFeatures(inputs): + +class LexiconFeatures(nn.Module): + def __init__(self): + super(LexiconFeatures, self).__init__() + + self.afinn = load_afinn_lexicon() + self.BL = bing_liu() + self.liwc = load_liwc_lex() + self.liwc_classes, _ = load_features(os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + self.mpqa = mpqa_lex() + self.semeval = semeval15_lexicon() + self.emolex = emolex() + + def forward(self, inputs, idx2word, padding_len): + to_tensor = ToTensor(device=DEVICE) + final_vector = [] +# import pdb; pdb.set_trace() + for inputt in inputs: + vec = [] + for inp in inputt: + word = idx2word[inp] + vector = [] + inp = inp.item() + if word in self.afinn: + vector.append(float(self.afinn[word])) + else: + vector.append(float(0)) + if word in self.semeval: + vector.append(self.semeval[word]) + else: + 
 vector.append(float(0)) + if word in self.BL: +# import pdb; pdb.set_trace() + vector.append(float(self.BL[word][0])) + else: + vector.append(float(0)) + if word in self.mpqa: + vector.extend(self.mpqa[word]) + else: + vector.extend([float(0)]*4) + if word in self.liwc: + v = [float(i) for i in self.liwc[word]] + vector.extend(v) + else: + vector.extend([float(0)]*73) + if word in self.emolex: + v = [float(i) for i in self.emolex[word]] + vector.extend(v) + else: + vector.extend([float(0)]*19) + vec.append(vector) + try: + final_vector.append(to_tensor(vec)) + except: + import pdb; pdb.set_trace() + final_vector = pad_sequence(final_vector, padding_len=padding_len, batch_first=True) + return final_vector diff --git a/slp/load_lexicons/get_liwc_features.py b/slp/load_lexicons/get_liwc_features.py new file mode 100755 index 0000000..04d8343 --- /dev/null +++ b/slp/load_lexicons/get_liwc_features.py @@ -0,0 +1,99 @@ +import os + +#from sys_config import BASE_DIR +import matplotlib.pyplot as plt +import seaborn as sns + +BASE_DIR = '../data/' +# LIWC Lexicon http://lit.eecs.umich.edu/~geoliwc/LIWC_Dictionary.htm + +def load_liwc_lexicon(file): + # returns LIWC in the form of a dictionary + # keys: words, values: feature vector (list) + + + _data = {} + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _word = _row[0] + _features = _row[1:] + _data[_word] = _features + return _data + + +def load_features(file): + + dim2num = {} # [dimension name]: corresponding number in lexicon list + num2dim = {} # the exact opposite + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _dim = _row[1] + dim2num[_dim] = line_id + num2dim[line_id] = _dim + return dim2num, num2dim + + +#################################################### +# Load LIWC Lexicon +#################################################### + +def
liwc_lex(): + # get the liwc lexicon in the form of a dictionary + # where keys are the unique words + # and values a list with all the dimensions (73 in total) + + + lex = load_liwc_lexicon( + os.path.join(BASE_DIR, 'PsycholinguisticLexicon.txt')) + + total_words = len(lex) + + # get the two dictionaries that relate every dimension name + # with its corresponding number (value) in the lexicon dimension list + dim2num, num2dim = load_features( + os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + + #################################################### + # Plot statistics of LIWC Lexicon + #################################################### + + # The lexiconss has 18504 words and for each word a feature vector of size 71. + # Each dimension represents a category (for example affect, posemo, negemo etc) + # The vector contains '1' when this word is includied in the particular category. + # Otherwise '0'. + # Using a bar plot we can decide which dimensions of this feature vector are useful for our work. 
+ + # initialization of count dictionary + dimensions = list(dim2num.keys()) + dim_counts = {dim: 0 for dim in dimensions} + + for word in lex: + ones = [i for i, x in enumerate(lex[word]) if x == '1'] + for index in ones: + dim_counts[num2dim[index]] += 1 + + sorted_tuples = sorted(dim_counts.items(), key=lambda kv: kv[1]) + + x = [k[1] for k in sorted_tuples if k[1] > 500] + y = [k[0] for k in sorted_tuples if k[1] > 500] + + + plt.figure() + sns.barplot(x=x, y=y) + plt.title('Number of words for each dimension of the LIWC lexicon') + # plt.show() + plt.savefig('liwc_dims_statistics.png') + # plt.close() + + print(len(lex)) + + +def load_liwc_lex(): + return load_liwc_lexicon( + os.path.join(BASE_DIR, 'PsycholinguisticLexicon.txt')) + + # liwc_lex() diff --git a/slp/load_lexicons/get_mpqa_features.py b/slp/load_lexicons/get_mpqa_features.py new file mode 100755 index 0000000..e8b79d9 --- /dev/null +++ b/slp/load_lexicons/get_mpqa_features.py @@ -0,0 +1,63 @@ +import os +import pickle + +BASE_DIR = '../data/' + +def mpqa_lex(): + path = os.path.join(BASE_DIR, 'mpqa.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + + pos = list(data["reinforcement"].keys())[0] + + # dictionary in the following form: + # {'word': + # {'POS': + # {'strength':weaksubj or strongsubj, + # 'positive': 0 or 1, + # 'negative': 0 or 1, + # 'polarity': 0 or 1}}} + + polarities = [] + strengths = [] + pos_tags = [] + negatives = [] + positives = [] + lexicon = {} + feat_lexicon = {} + for key in data: + pos = list(data[key].keys())[0] + lexicon[key] = {'pos': pos, + 'strength': data[key][pos]['strength'], + 'positive': data[key][pos]['positive'], + 'negative': data[key][pos]['negative'], + 'polarity': data[key][pos]['polarity']} + polarities.append(data[key][pos]['polarity']) + pos_tags.append(pos) + negatives.append(data[key][pos]['negative']) + positives.append(data[key][pos]['positive']) + strengths.append(data[key][pos]['strength']) + + # first we add to the feature vector 
the subjectivity + if data[key][pos]['strength'] == "strongsubj": + feat_lexicon[key] = [1.0] + elif data[key][pos]['strength'] == "weaksubj": + feat_lexicon[key] = [0.0] + # then, the polarity + feat_lexicon[key].append(float(data[key][pos]['polarity'])) + # then, the positivity + feat_lexicon[key].append(float(data[key][pos]['positive'])) + # and finally the negativity + feat_lexicon[key].append(float(data[key][pos]['negative'])) + # print(len(lexicon)) + + # now it is in the form: { word:{'pos':_, 'positive':_, 'negative':_, 'polarity':_} } + # polarity: -2 to +2 + # pos: 'NOUN', 'ADJ', 'ADV', 'VERB', '_' + # strength: weaksubj or strongsubj + # positive/negative: 0 or 1 + + # the lists are for statistics. total words: 6886 + + return feat_lexicon + diff --git a/slp/load_lexicons/get_nrc_emolex_features.py b/slp/load_lexicons/get_nrc_emolex_features.py new file mode 100755 index 0000000..8dce1ef --- /dev/null +++ b/slp/load_lexicons/get_nrc_emolex_features.py @@ -0,0 +1,27 @@ +import os +import pickle + +# NRC Emotion Lexicon (Emolex) +# Total words: 14,182 +# dictionary: {word: {'fear':_, 'joy':_, 'positive':_, 'emotions':(list of len 8), 'sadness':_, +# 'negative':_, 'anticipation':_, 'polarity':_, 'anger':_, 'disgust':_, 'trust':_, 'surprise':_}} + +BASE_DIR = '../data/' + +def emolex(): + path = os.path.join(BASE_DIR, 'emolex.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + + lex = {} + for word in data: + features = [] + for key in data[word]: + if not isinstance(data[word][key], list): + features.append(data[word][key]) + else: + features += data[word][key] + lex[word]=features + + + return lex diff --git a/slp/load_lexicons/get_semeval2015_twitter_features.py b/slp/load_lexicons/get_semeval2015_twitter_features.py new file mode 100755 index 0000000..04a2d0c --- /dev/null +++ b/slp/load_lexicons/get_semeval2015_twitter_features.py @@ -0,0 +1,18 @@ +import os +import pickle + +BASE_DIR = '../data/' + +# SemEval-2015 English Twitter Sentiment
Lexicon +# aka NRC MaxDiff Twitter Sentiment Lexicon +# Total words: 1515 (including hashtags like #ew) +# dictionary: {word: real value -1 to +1, representing negative/positive sentiment} + +def semeval15_lexicon(): + path = os.path.join(BASE_DIR, 'SemEval2015-English-Twitter-Lexicon.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + return data + + +# print(len(data)) diff --git a/slp/modules/helpers.py b/slp/modules/helpers.py index 334c1c3..e476954 100644 --- a/slp/modules/helpers.py +++ b/slp/modules/helpers.py @@ -9,8 +9,9 @@ def __init__(self, batch_first=True): super(PadPackedSequence, self).__init__() self.batch_first = batch_first - def forward(self, x, lengths): - max_length = lengths.max().item() + def forward(self, x, lengths, max_length): +# import pdb; pdb.set_trace() +# max_length = lengths.max().item() x, _ = pad_packed_sequence( x, batch_first=self.batch_first, total_length=max_length) return x @@ -22,6 +23,7 @@ def __init__(self, batch_first=True): self.batch_first = batch_first def forward(self, x, lengths): +# import pdb; pdb.set_trace() x = pack_padded_sequence( x, lengths, batch_first=self.batch_first, diff --git a/slp/modules/hier_att_net_title.py b/slp/modules/hier_att_net_title.py index dec0220..4b1c3cc 100644 --- a/slp/modules/hier_att_net_title.py +++ b/slp/modules/hier_att_net_title.py @@ -4,6 +4,7 @@ from slp.modules.helpers import PackSequence, PadPackedSequence from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_lexicons import LexiconFeatures #DEVICE = 'cpu' DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' @@ -22,7 +23,7 @@ class WordAttNet(nn.Module): def __init__(self, dict_size, diction, hidden_size=300): super(WordAttNet, self).__init__() - self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + self.gru = nn.GRU(380, 380, bidirectional = True, batch_first=True) #changed hidden & input size. 
self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) self.context = nn.Linear(2 * hidden_size, 1, bias=False) @@ -31,11 +32,15 @@ def __init__(self, dict_size, diction, hidden_size=300): self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) self.pack = PackSequence(batch_first=True) self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() - - def forward(self, inputs, lengths, hidden_state): + def forward(self, inputs, lengths, hidden_state, idx2word): output = self.lookup(inputs) - + + output_lex = self.lexicons(inputs, idx2word, padding_len=output.shape[1]).float() + + output = torch.cat((output, output_lex), axis=2) #to concatenation tis eisodou me ta lexica + output, lengths = self.pack(output,lengths) f_output, h_output = self.gru(output.float(), hidden_state) f_output = self.unpack(f_output, lengths) @@ -51,7 +56,7 @@ class SentAttNet(nn.Module): def __init__(self,hidden_size=300, num_classes=0): super(SentAttNet, self).__init__() num_classes = num_classes - self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) + self.gru = nn.GRU(2 * hidden_size, 380, bidirectional=True, batch_first=True) #changed hidden size self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) self.context = nn.Linear(2 * hidden_size, 1, bias=False) @@ -66,7 +71,6 @@ def forward(self, inputs, lengths, titles, hidden_state): f_output, h_output = self.gru(f_output, hidden_state) f_output = self.unpack(f_output, lengths) - titles = torch.unsqueeze(titles, dim=1) f_output = torch.cat((f_output,titles), dim=1) @@ -80,12 +84,13 @@ def forward(self, inputs, lengths, titles, hidden_state): class HierAttNet(nn.Module): - def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word): super (HierAttNet, self).__init__() self.num_classes = num_classes 
self.batch_size = batch_size self.hidden_size = hidden_size + self.idx2word = idx2word self.sent_att_net = SentAttNet(self.hidden_size, num_classes) self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) @@ -116,18 +121,15 @@ def forward(self, inputs, lengths, titles, title_lengths): if word_lengths[k] == 0: word_lengths[k] = 1 - output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state) #[8,600] + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word) #[8,600] output_list_text.append(output_text) - self.word_hidden_state = repackage_hidden(self.word_hidden_state) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) - output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, self.word_hidden_state) #[8,600] + output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, self.word_hidden_state, self.idx2word) #[8,600] self.word_hidden_state = repackage_hidden(self.word_hidden_state) - - - - # output_list_text = (S, B, 600) - + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, output_title, self.sent_hidden_state) self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) diff --git a/slp/modules/hier_att_net_title_attentional_embed.py b/slp/modules/hier_att_net_title_attentional_embed.py new file mode 100644 index 0000000..32086f7 --- /dev/null +++ b/slp/modules/hier_att_net_title_attentional_embed.py @@ -0,0 +1,172 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_6lexicons import LexiconFeatures + +DEVICE = 'cpu' +#DEVICE = 'cuda' if 
torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, lex_size, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) #changed hidden & input size. + + self.word = nn.Linear(2 * hidden_size + lex_size, 2 * hidden_size + lex_size) + self.context = nn.Linear(2 * hidden_size + lex_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() + + def forward(self, inputs, lengths, hidden_state, idx2word, lex_size, is_title=False): + output_emb = self.lookup(inputs) + + import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + try: + f_output, h_output = self.gru(output.float(), hidden_state) + except: + for i in output[0]: + if i.shape[0]!=300: + import pdb; pdb.set_trace() + f_output = self.unpack(f_output, lengths, max_length = 150) + +# f_output = f_output.view(batch_size, num_sentences, -1) + + if is_title == False: + output_lex = self.lexicons(inputs, idx2word, padding_len=f_output.shape[1]).float() + f_output = torch.cat((f_output, output_lex), axis=2) + else: + import pdb; pdb.set_trace() + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, lex_size, hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * 
hidden_size + lex_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + +# titles = torch.unsqueeze(titles, dim=1) #try without title +# f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word, lex_size): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + self.idx2word = idx2word + self.lex_size = lex_size + + self.sent_att_net = SentAttNet(lex_size, self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, lex_size, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, number_of_sentences, length_of_sentences, titles, title_lengths): + # inputs = 
(B, S, W) + import pdb; pdb.set_trace() + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + + batch_size = inputs.size(0) + num_sentences = inputs.size(1) + inputs = inputs.view(batch_size * num_sentences, -1) + length_of_sentences = length_of_sentences.view(-1) + + temp = inputs[:100] + temp_lengths = length_of_sentences[:100] + + output_text, self.word_hidden_state = self.word_att_net_text(temp, temp_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) + + +# ----- +# for i in text: + +# word_lengths = i.size(1) - (i==0).sum(dim=1) # to mikos tis kathe protasis apo tis 8 tou batch kathe fora +# if 0 in word_lengths: +# for k in range(0, inputs.size()[0]): +# if word_lengths[k] == 0: +# word_lengths[k] = 1 +# all_word_lengths.append(word_lengths) +# output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) #[8,600] +# output_list_text.append(output_text) +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + +# ----- + + +# output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, # try without title +# self.word_hidden_state, self.idx2word, +# self.lex_size, is_title=True) #[8,600] +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, number_of_sentences, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/hier_att_net_title_attentional_embed2.py b/slp/modules/hier_att_net_title_attentional_embed2.py new file mode 100644 index 0000000..a9cb4d6 --- /dev/null +++ b/slp/modules/hier_att_net_title_attentional_embed2.py @@ -0,0 +1,149 @@ +import torch 
+import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_6lexicons import LexiconFeatures + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, lex_size, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) #changed hidden & input size. + + self.word = nn.Linear(2 * hidden_size + lex_size, 2 * hidden_size + lex_size) + self.context = nn.Linear(2 * hidden_size + lex_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() + + def forward(self, inputs, lengths, hidden_state, idx2word, lex_size, is_title=False): + output_emb = self.lookup(inputs) + +# import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + if is_title == False: + output_lex = self.lexicons(inputs, idx2word, padding_len=f_output.shape[1]).float() + f_output = torch.cat((f_output, output_lex), axis=2) + else: + import pdb; pdb.set_trace() + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, lex_size, hidden_size=300, 
num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size + lex_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + +# titles = torch.unsqueeze(titles, dim=1) #try without title +# f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word, lex_size): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + self.idx2word = idx2word + self.lex_size = lex_size + + self.sent_att_net = SentAttNet(lex_size, self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, lex_size, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + 
def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + import pdb; pdb.set_trace() + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 + all_word_lengths.append(word_lengths) + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + +# output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, # try without title +# self.word_hidden_state, self.idx2word, +# self.lex_size, is_title=True) #[8,600] +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/trainer/trainer.py b/slp/trainer/trainer.py index 48bddfa..f4ee265 100644 --- a/slp/trainer/trainer.py +++ b/slp/trainer/trainer.py @@ -154,7 +154,8 @@ def train_step(self: TrainerType, engine: Engine, batch: List[torch.Tensor]) -> float: self.model.train() - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) loss = self.loss_fn(y_pred, targets.long()) # type: ignore if self.parallel: @@ -173,6 +174,7 @@ def eval_step( batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: self.model.eval() with torch.no_grad(): + #import pdb; pdb.set_trace() y_pred, targets = self.get_predictions_and_targets(batch) return 
y_pred, targets @@ -275,7 +277,68 @@ def parse_batch( return inputs, inputs + class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + len_inputs = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + len_titles = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + return inputs, titles, targets, len_inputs, len_titles + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, len_inputs, len_titles = self.parse_batch(batch) + y_pred = self.model(inputs, len_inputs) + import pdb; pdb.set_trace() + return y_pred, targets + + +class SequentialTrainerTitle(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + len_inputs = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + len_titles = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + return inputs, titles, targets, len_inputs, len_titles + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, len_inputs, len_titles = self.parse_batch(batch) + y_pred = self.model(inputs, len_inputs) + import pdb; pdb.set_trace() + return y_pred, targets + + +class BertTrainer(Trainer): def parse_batch( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: @@ 
-285,18 +348,22 @@ def parse_batch( targets = to_device(batch[1], device=self.device, non_blocking=self.non_blocking) - lengths = to_device(batch[2], + masks = to_device(batch[2], device=self.device, non_blocking=self.non_blocking) - return inputs, targets, lengths + segments = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, masks, segments def get_predictions_and_targets( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: - inputs, targets, lengths = self.parse_batch(batch) + inputs, targets, masks, segments = self.parse_batch(batch) #import pdb; pdb.set_trace() - y_pred = self.model(inputs, lengths) - return y_pred, targets + logits = self.model(inputs, token_type_ids=segments, attention_mask=masks) + + return logits, targets class Seq2seqTrainer(SequentialTrainer): diff --git a/slp/trainer/trainer_title.py b/slp/trainer/trainer_title.py index 09e7d2a..f944752 100644 --- a/slp/trainer/trainer_title.py +++ b/slp/trainer/trainer_title.py @@ -301,6 +301,7 @@ def get_predictions_and_targets( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: inputs, titles, targets, lengths, title_lengths = self.parse_batch(batch) + # import pdb; pdb.set_trace() y_pred = self.model(inputs, lengths, titles, title_lengths) return y_pred, targets diff --git a/slp/trainer/trainer_title_no_validation.py b/slp/trainer/trainer_title_no_validation.py new file mode 100644 index 0000000..c9fef5f --- /dev/null +++ b/slp/trainer/trainer_title_no_validation.py @@ -0,0 +1,359 @@ +import os +from typing import Union +import torch +import torch.nn as nn + +from ignite.handlers import EarlyStopping +from ignite.contrib.handlers import ProgressBar +from ignite.engine import Engine, Events, State +from ignite.metrics import RunningAverage, Loss + +from torch.optim.optimizer import Optimizer +from torch.nn.modules.loss import _Loss +from torch.utils.data import DataLoader + +from typing import cast, List, Optional, Tuple, 
TypeVar +from slp.util import types +from slp.util.parallel import DataParallelModel, DataParallelCriterion + +from slp.trainer.handlers import CheckpointHandler, EvaluationHandler +from slp.util import from_checkpoint, to_device +from slp.util import log +from slp.util import system + + +TrainerType = TypeVar('TrainerType', bound='Trainer') + + +class Trainer(object): + def __init__(self: TrainerType, + model: nn.Module, + optimizer: Optimizer, + checkpoint_dir: str = '../../checkpoints', + experiment_name: str = 'experiment', + model_checkpoint: Optional[str] = None, + optimizer_checkpoint: Optional[str] = None, + metrics: types.GenericDict = None, + patience: int = 10, + validate_every: int = 1, + accumulation_steps: int = 1, + loss_fn: Union[_Loss, DataParallelCriterion] = None, + non_blocking: bool = True, + retain_graph: bool = False, + dtype: torch.dtype = torch.float, + device: str = 'cpu', + parallel: bool = False) -> None: + self.dtype = dtype + self.retain_graph = retain_graph + self.non_blocking = non_blocking + self.device = device + self.loss_fn = loss_fn + self.validate_every = validate_every + self.patience = patience + self.accumulation_steps = accumulation_steps + self.checkpoint_dir = checkpoint_dir + + model_checkpoint = self._check_checkpoint(model_checkpoint) + optimizer_checkpoint = self._check_checkpoint(optimizer_checkpoint) + + self.model = cast(nn.Module, from_checkpoint( + model_checkpoint, model, map_location=torch.device('cpu'))) + self.model = self.model.type(dtype).to(device) + self.optimizer = from_checkpoint(optimizer_checkpoint, optimizer) + self.parallel = parallel + if parallel: + if device == 'cpu': + raise ValueError("parallel can be used only with cuda device") + self.model = DataParallelModel(self.model).to(device) + self.loss_fn = DataParallelCriterion(self.loss_fn) # type: ignore + if metrics is None: + metrics = {} + if 'loss' not in metrics: + if self.parallel: + metrics['loss'] = Loss( + lambda x, y: self.loss_fn(x, 
y).mean()) # type: ignore + else: + metrics['loss'] = Loss(self.loss_fn) + self.trainer = Engine(self.train_step) + #self.train_evaluator = Engine(self.eval_step) + self.valid_evaluator = Engine(self.eval_step) + for name, metric in metrics.items(): + #metric.attach(self.train_evaluator, name) + metric.attach(self.valid_evaluator, name) + + self.pbar = ProgressBar() + self.val_pbar = ProgressBar(desc='Validation') + + if checkpoint_dir is not None: + self.checkpoint = CheckpointHandler( + checkpoint_dir, experiment_name, score_name='validation_loss', + score_function=self._score_fn, n_saved=2, + require_empty=False, save_as_state_dict=True) + + self.early_stop = EarlyStopping( + patience, self._score_fn, self.trainer) + + self.val_handler = EvaluationHandler(pbar=self.pbar, + validate_every=1, + early_stopping=self.early_stop) + self.attach() + log.info( + f'Trainer configured to run {experiment_name}\n' + f'\tpretrained model: {model_checkpoint} {optimizer_checkpoint}\n' + f'\tcheckpoint directory: {checkpoint_dir}\n' + f'\tpatience: {patience}\n' + f'\taccumulation steps: {accumulation_steps}\n' + f'\tnon blocking: {non_blocking}\n' + f'\tretain graph: {retain_graph}\n' + f'\tdevice: {device}\n' + f'\tmodel dtype: {dtype}\n' + f'\tparallel: {parallel}') + + def _check_checkpoint(self: TrainerType, + ckpt: Optional[str]) -> Optional[str]: + if ckpt is None: + return ckpt + if system.is_url(ckpt): + ckpt = system.download_url(cast(str, ckpt), self.checkpoint_dir) + ckpt = os.path.join(self.checkpoint_dir, ckpt) + return ckpt + + @staticmethod + def _score_fn(engine: Engine) -> float: + """Returns the scoring metric for checkpointing and early stopping + + Args: + engine (ignite.engine.Engine): The engine that calculates + the val loss + + Returns: + (float): The validation loss + """ + negloss: float = -engine.state.metrics['loss'] + return negloss + + def parse_batch( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = 
to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets + + def get_predictions_and_targets( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets = self.parse_batch(batch) + y_pred = self.model(inputs) + return y_pred, targets + + def train_step(self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> float: + self.model.train() + #import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) + loss = self.loss_fn(y_pred, targets.long()) # type: ignore + if self.parallel: + loss = loss.mean() + loss = loss / self.accumulation_steps + loss.backward(retain_graph=self.retain_graph) + if (self.trainer.state.iteration + 1) % self.accumulation_steps == 0: + self.optimizer.step() # type: ignore + self.optimizer.zero_grad() + loss_value: float = loss.item() + return loss_value + + def eval_step( + self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + self.model.eval() + with torch.no_grad(): + y_pred, targets = self.get_predictions_and_targets(batch) + return y_pred, targets + + def predict(self: TrainerType, dataloader: DataLoader) -> State: + return self.valid_evaluator.run(dataloader) + + def fit(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader, + epochs: int = 50) -> State: + log.info( + 'Trainer will run for\n' + f'model: {self.model}\n' + f'optimizer: {self.optimizer}\n' + f'loss: {self.loss_fn}') +# self.val_handler.attach(self.trainer, +# self.train_evaluator, +# train_loader, +# validation=False) + self.val_handler.attach(self.trainer, + self.valid_evaluator, + val_loader, + validation=True) + self.model.zero_grad() + self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + 
return best_score + + + + + def overfit_single_batch(self: TrainerType, + train_loader: DataLoader) -> State: + single_batch = [next(iter(train_loader))] + + if self.trainer.has_event_handler(self.val_handler, Events.EPOCH_COMPLETED): + self.trainer.remove_event_handler(self.val_handler, Events.EPOCH_COMPLETED) + +# self.val_handler.attach(self.trainer, +# self.train_evaluator, +# single_batch, # type: ignore +# validation=False) + out = self.trainer.run(single_batch, max_epochs=100) + return out + + def fit_debug(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader) -> State: + train_loader = iter(train_loader) + train_subset = [next(train_loader), next(train_loader)] + val_loader = iter(val_loader) # type: ignore + val_subset = [next(val_loader), next(val_loader)] # type ignore + out = self.fit(train_subset, val_subset, epochs=6) # type: ignore + return out + + def _attach_checkpoint(self: TrainerType) -> TrainerType: + ckpt = { + 'model': self.model, + 'optimizer': self.optimizer + } + if self.checkpoint_dir is not None: + self.valid_evaluator.add_event_handler( + Events.COMPLETED, self.checkpoint, ckpt) + return self + + + def attach(self: TrainerType) -> TrainerType: + ra = RunningAverage(output_transform=lambda x: x) + ra.attach(self.trainer, "Train Loss") + self.pbar.attach(self.trainer, ['Train Loss']) +# self.val_pbar.attach(self.train_evaluator) + self.val_pbar.attach(self.valid_evaluator) + self.valid_evaluator.add_event_handler(Events.COMPLETED, + self.early_stop) + self = self._attach_checkpoint() + def graceful_exit(engine, e): + if isinstance(e, KeyboardInterrupt): + engine.terminate() + log.warn("CTRL-C caught. 
Exiting gracefully...") + else: + raise(e) + + self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit) + #self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + # graceful_exit) + self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + return self + + +class AutoencoderTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs + + +class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + +# import pdb; pdb.set_trace() + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + number_of_sentences = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + length_of_sentences = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[5], + device=self.device, + non_blocking=self.non_blocking) +# length_of_sentences = to_device(batch[5], +# device=self.device, +# non_blocking=self.non_blocking) + + return inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths = self.parse_batch(batch) + # import pdb; pdb.set_trace() + y_pred = self.model(inputs, number_of_sentences, length_of_sentences, titles, title_lengths) + return y_pred, targets + + +class Seq2seqTrainer(SequentialTrainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + 
non_blocking=self.non_blocking) + lengths = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs, lengths + + +class TransformerTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + mask_inputs = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + mask_targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, mask_inputs, mask_targets + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets, mask_inputs, mask_targets = self.parse_batch(batch) + y_pred = self.model(inputs, + targets, + source_mask=mask_inputs, + target_mask=mask_targets) + targets = targets.view(-1) + y_pred = y_pred.view(targets.size(0), -1) + # TODO: BEAMSEARCH!! 
+ return y_pred, targets From 1c5c4a4ccd54c83d5e89cad47051c90f808c5942 Mon Sep 17 00:00:00 2001 From: danaiksez Date: Sat, 14 Mar 2020 21:47:08 +0200 Subject: [PATCH 3/5] basic Hier model for client --- examples/basic_model_DRNN.py | 163 +++++++++++++++++ examples/therapy_title.py | 42 +++-- examples/therapy_unbalanced.py | 190 ++++++++++++++++++++ slp/data/collators.py | 8 +- slp/data/collators_title_touvlo.py | 84 +++++++++ slp/data/therapy.py | 1 + slp/data/therapy_title.py | 101 ++++++----- slp/modules/basic_model.py | 156 ++++++++++++++++ slp/modules/basic_model_DRNN.py | 199 +++++++++++++++++++++ slp/modules/drnn.py | 131 ++++++++++++++ slp/modules/helpers.py | 4 +- slp/trainer/trainer_title_no_validation.py | 54 +++++- 12 files changed, 1065 insertions(+), 68 deletions(-) create mode 100644 examples/basic_model_DRNN.py create mode 100644 examples/therapy_unbalanced.py create mode 100644 slp/data/collators_title_touvlo.py create mode 100644 slp/modules/basic_model.py create mode 100644 slp/modules/basic_model_DRNN.py create mode 100644 slp/modules/drnn.py diff --git a/examples/basic_model_DRNN.py b/examples/basic_model_DRNN.py new file mode 100644 index 0000000..378c9a2 --- /dev/null +++ b/examples/basic_model_DRNN.py @@ -0,0 +1,163 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.basic_model_DRNN import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 
'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + num_workers=0, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + num_workers=0, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + 
loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 4 + batch_val = 4 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 4 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + 
trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title.py b/examples/therapy_title.py index d9963ea..c20b279 100644 --- a/examples/therapy_title.py +++ b/examples/therapy_title.py @@ -9,12 +9,12 @@ from torchvision.transforms import Compose from sklearn.model_selection import KFold -from slp.data.collators_title import SequenceClassificationCollator +from slp.data.collators import SequenceClassificationCollator from slp.data.therapy_title import PsychologicalDataset, TupleDataset from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken -from slp.modules.hier_att_net_title import HierAttNet +from slp.modules.basic_model import HierAttNet from slp.util.embeddings import EmbeddingsLoader -from slp.trainer.trainer_title import SequentialTrainer +from slp.trainer.trainer_title_no_validation import SequentialTrainer #DEVICE = 'cpu' DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' @@ -33,13 +33,13 @@ def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, b dataset, batch_size=batch_train, sampler=train_sampler, - drop_last=False, + drop_last=True, collate_fn=COLLATE_FN) val_loader = DataLoader( dataset, batch_size=batch_val, sampler=val_sampler, - drop_last=False, + drop_last=True, collate_fn=COLLATE_FN) return train_loader, val_loader @@ -57,6 +57,7 @@ def train_test_split(dataset, batch_train, batch_val, train_indices = indices[test_split:] val_indices = indices[:test_split] + import pdb; pdb.set_trace() return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) @@ -65,9 +66,9 @@ def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): for train_indices, val_indices in kfold.split(dataset): yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) -def trainer_factory(embeddings, idx2word, 
device=DEVICE): +def trainer_factory(embeddings, device=DEVICE): model = HierAttNet( - hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word) + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) model = model.to(DEVICE) criterion = nn.CrossEntropyLoss() optimizer = Adam(model.parameters(), lr=0.0005) @@ -100,11 +101,12 @@ def trainer_factory(embeddings, idx2word, device=DEVICE): max_word_length = 150 #max length of each sentence (turn) - after padding num_classes = 2 batch_size = 8 - hidden_size = 380 + hidden_size = 300 epochs = 40 # loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) word2idx, idx2word, embeddings = loader.load() embeddings = torch.tensor(embeddings) @@ -115,7 +117,10 @@ def trainer_factory(embeddings, idx2word, device=DEVICE): to_tensor = ToTensor(device=DEVICE) bio = PsychologicalDataset( - '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', +# '../data/balanced_new_csv.csv', + '../../../test_dataset.csv', +# '../../../depressive_dataset.csv', + '../../../test_CEL/slp/data/psychotherapy/', max_word_length, text_transforms = Compose([ tokenizer, @@ -123,6 +128,20 @@ def trainer_factory(embeddings, idx2word, device=DEVICE): to_token_ids, to_tensor])) + de = 0 + nd = 0 + m = 0 +# for i, (t,x,feat, l) in enumerate(bio): +# m += 1 +# if (l==1): +# de += 1 +# else: +# nd += 1 + print(m) + print("----------------") + print(de) + print(nd) + print("----------------") @@ -130,7 +149,8 @@ def trainer_factory(embeddings, idx2word, device=DEVICE): cv_scores = [] import gc for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): - trainer = trainer_factory(embeddings, idx2word, device=DEVICE) +# import pdb; pdb.set_trace() + trainer = trainer_factory(embeddings, device=DEVICE) fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) 
cv_scores.append(fold_score) print("**********************") @@ -141,7 +161,7 @@ def trainer_factory(embeddings, idx2word, device=DEVICE): final_score = float(sum(cv_scores)) / len(cv_scores) else: train_loader, val_loader = train_test_split(bio, batch_train, batch_val) - trainer = trainer_factory(embeddings, idx2word, device=DEVICE) + trainer = trainer_factory(embeddings, device=DEVICE) final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) print(f'Final score: {final_score}') diff --git a/examples/therapy_unbalanced.py b/examples/therapy_unbalanced.py new file mode 100644 index 0000000..8b13ac9 --- /dev/null +++ b/examples/therapy_unbalanced.py @@ -0,0 +1,190 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy, Precision, Recall +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.basic_model import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + 
sampler=val_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + import pdb; pdb.set_trace() + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + precision = Precision(average=False) + recall = Recall(average=False) + avg_prec = precision.mean() + avg_rec = recall.mean() + F1 = (precision * recall * 2 / (precision + recall + 1e-7)).mean() + metrics = { + 'accuracy': Accuracy(), + 'presicion': avg_prec, + 'recall': avg_rec, + 'f1': F1, + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir=None, # '../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - 
after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( +# '../data/balanced_new_csv.csv', +# '../../../test_dataset.csv', +# '../../../depressive_dataset.csv', + '../../../unbalanced_dataset.csv', + '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + de = 0 + nd = 0 + m = 0 + for i, (t,x,l) in enumerate(bio): + m += 1 + if (l==1): + de += 1 + else: + nd += 1 + print(m) + print("----------------") + print(de) + print(nd) + print("----------------") + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): +# import pdb; pdb.set_trace() + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + 
print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/slp/data/collators.py b/slp/data/collators.py index e38ae25..2a3711f 100644 --- a/slp/data/collators.py +++ b/slp/data/collators.py @@ -12,10 +12,11 @@ def __init__(self, pad_indx=0, device='cpu'): self.device = device def __call__(self, batch): - inputs, titles, targets = map(list, zip(*batch)) + inputs, titles, features, targets = map(list, zip(*batch)) + lengths = torch.tensor([len(s) for s in inputs], device=self.device) lengths_title = torch.tensor([len(t) for t in titles], device=self.device) - + # Pad and convert to tensor inputs = (pad_sequence(inputs, batch_first=True, @@ -27,7 +28,8 @@ def __call__(self, batch): .to(self.device)) targets = mktensor(targets, device=self.device, dtype=torch.long) - return inputs, titles, targets.to(self.device), lengths, lengths_title + features = mktensor(features, device=self.device) + return inputs, titles, features.to(self.device), targets.to(self.device), lengths, lengths_title class BertCollator(object): diff --git a/slp/data/collators_title_touvlo.py b/slp/data/collators_title_touvlo.py new file mode 100644 index 0000000..5bf2617 --- /dev/null +++ b/slp/data/collators_title_touvlo.py @@ -0,0 +1,84 @@ +import torch +from torch.nn.utils.rnn import pack_padded_sequence , pad_sequence + +from slp.modules.util import pad_mask, subsequent_mask +from slp.util import mktensor +from slp.data.therapy_title import pad_sequence as pad_sequence1 + +class SequenceClassificationCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): +# import pdb; pdb.set_trace() + padding_len=150 + inputs, titles, targets = map(list, zip(*batch)) + number_of_sentences = torch.tensor([len(s) for s in inputs], 
device=self.device) + length_of_sentences = ([torch.tensor([len(s) if len(s) mx: + mx = i[1] + features = [mean_lengths, turns_no, mx] + return (preprocessed_text, preprocessed_title, features, lab) diff --git a/slp/modules/basic_model.py b/slp/modules/basic_model.py new file mode 100644 index 0000000..3020d40 --- /dev/null +++ b/slp/modules/basic_model.py @@ -0,0 +1,156 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence + + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + def forward(self, inputs, lengths, hidden_state, is_title=False): + output_emb = self.lookup(inputs) + +# import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, hidden_size=300, num_classes=0): + 
super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size + 3, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, features, lengths, hidden_state): +# import pdb; pdb.set_trace() + + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + #titles = torch.unsqueeze(titles, dim=1) +# f_output = torch.cat((f_output,features), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + + #features = torch.unsqueeze(features, dim=2) + + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + + output = torch.cat((output,features), dim=1) + output = self.fc(output).squeeze() + + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, 
inputs, lengths, features, titles, title_lengths): + # inputs = (B, S, W) + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + output_list_text = [] + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 +# all_word_lengths.append(word_lengths) + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, + self.word_hidden_state, + is_title=False) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + + + #output_title, self.word_hidden_state = self.word_att_net_text(titles, # try without title + # title_lengths, + # self.word_hidden_state, + # self.idx2word, + # self.lex_size, + # is_title=True) #[8,600] + #self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, features, lengths, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/basic_model_DRNN.py b/slp/modules/basic_model_DRNN.py new file mode 100644 index 0000000..0bff7f0 --- /dev/null +++ b/slp/modules/basic_model_DRNN.py @@ -0,0 +1,199 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.modules.drnn import DRNN + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) +2 + +class 
WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + n_layers = 2 + embedding_size = 300 + self.model = DRNN(embedding_size, hidden_size, n_layers, cell_type='GRU') + + + + + #self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + def forward(self, inputs, hidden_state, is_title=False): + output_emb = self.lookup(inputs) + + import pdb; pdb.set_trace() + +# output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.model(output_emb.float(), hidden_state) +# f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + #titles = torch.unsqueeze(titles, dim=1) + #f_output = 
torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + + n_layers = 2 + n_classes = 2 + embedding_size = 300 + + self.model = DRNN(embedding_size, hidden_size, n_layers, cell_type='GRU', batch_first=True) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + + self.linear = nn.Linear(hidden_size, n_classes) + + self.sent = nn.Linear(hidden_size, hidden_size) + self.context = nn.Linear(hidden_size, 1, bias=False) + + + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + output_list_text = [] + + import pdb; pdb.set_trace() + batch_size = inputs.size(0) + num_sentences = inputs.size(1) + padded = inputs.size(2) + #inputs = inputs.view(batch_size * num_sentences, -1) + + inputs = inputs.view(batch_size, num_sentences * padded) + + output_emb = self.lookup(inputs) + layer_outputs, self.word_hidden_state = self.model(output_emb.float()) + 
#self.word_hidden_state) + output_list_text.append(layer_outputs) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + + + + + + preds = [] + for i in range(batch_size): + output = self.sent(layer_outputs[i]) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + pred = self.linear(output) + preds.append(pred) +# pred = self.linear(layer_outputs[-1]) + output = preds + + + +#----- +# for i in text: + +# word_lengths = i.size(1) - (i==0).sum(dim=1) +# if 0 in word_lengths: +# for k in range(0, inputs.size()[0]): +# if word_lengths[k] == 0: +# word_lengths[k] = 1 +### all_word_lengths.append(word_lengths) +# output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, +# self.word_hidden_state, +# is_title=False) #[8,600] +# output_list_text.append(output_text) +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) +#----- + + + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) + import pdb; pdb.set_trace() + + +# output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) +# output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) +# self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/drnn.py b/slp/modules/drnn.py new file mode 100644 index 0000000..886cb33 --- /dev/null +++ b/slp/modules/drnn.py @@ -0,0 +1,131 @@ +import torch +import torch.nn as nn + + +use_cuda = torch.cuda.is_available() + + +class DRNN(nn.Module): + + def __init__(self, n_input, n_hidden, n_layers, dropout=0, cell_type='GRU', batch_first=False): + super(DRNN, self).__init__() + + self.dilations = [2 ** i for i in range(n_layers)] + self.cell_type = cell_type + self.batch_first = batch_first + + layers = [] + if self.cell_type == "GRU": + cell = nn.GRU + elif self.cell_type == "RNN": + cell = nn.RNN + elif self.cell_type == "LSTM": + cell = nn.LSTM + 
else: + raise NotImplementedError + + for i in range(n_layers): + if i == 0: + c = cell(n_input, n_hidden, dropout=dropout) + else: + c = cell(n_hidden, n_hidden, dropout=dropout) + layers.append(c) + self.cells = nn.Sequential(*layers) + + def forward(self, inputs, hidden=None): + +# import pdb; pdb.set_trace() + + if self.batch_first: + inputs = inputs.transpose(0, 1) + outputs = [] + for i, (cell, dilation) in enumerate(zip(self.cells, self.dilations)): + if hidden is None: + inputs, _ = self.drnn_layer(cell, inputs, dilation) + else: + inputs, hidden[i] = self.drnn_layer(cell, inputs, dilation, hidden[i]) + + outputs.append(inputs[-dilation:]) + + if self.batch_first: + inputs = inputs.transpose(0, 1) + return inputs, outputs + + def drnn_layer(self, cell, inputs, rate, hidden=None): + n_steps = len(inputs) + batch_size = inputs[0].size(0) + hidden_size = cell.hidden_size + + inputs, _ = self._pad_inputs(inputs, n_steps, rate) + dilated_inputs = self._prepare_inputs(inputs, rate) + + if hidden is None: + dilated_outputs, hidden = self._apply_cell(dilated_inputs, cell, batch_size, rate, hidden_size) + else: + hidden = self._prepare_inputs(hidden, rate) + dilated_outputs, hidden = self._apply_cell(dilated_inputs, cell, batch_size, rate, hidden_size, hidden=hidden) + + splitted_outputs = self._split_outputs(dilated_outputs, rate) + outputs = self._unpad_outputs(splitted_outputs, n_steps) + + return outputs, hidden + + def _apply_cell(self, dilated_inputs, cell, batch_size, rate, hidden_size, hidden=None): + if hidden is None: + if self.cell_type == 'LSTM': + c, m = self.init_hidden(batch_size * rate, hidden_size) + hidden = (c.unsqueeze(0), m.unsqueeze(0)) + else: + hidden = self.init_hidden(batch_size * rate, hidden_size).unsqueeze(0) + + dilated_outputs, hidden = cell(dilated_inputs, hidden) + + return dilated_outputs, hidden + + def _unpad_outputs(self, splitted_outputs, n_steps): + return splitted_outputs[:n_steps] + + def _split_outputs(self, dilated_outputs, 
rate): + batchsize = dilated_outputs.size(1) // rate + + blocks = [dilated_outputs[:, i * batchsize: (i + 1) * batchsize, :] for i in range(rate)] + + interleaved = torch.stack((blocks)).transpose(1, 0).contiguous() + interleaved = interleaved.view(dilated_outputs.size(0) * rate, + batchsize, + dilated_outputs.size(2)) + return interleaved + + def _pad_inputs(self, inputs, n_steps, rate): + is_even = (n_steps % rate) == 0 + + if not is_even: + dilated_steps = n_steps // rate + 1 + + zeros_ = torch.zeros(dilated_steps * rate - inputs.size(0), + inputs.size(1), + inputs.size(2)) + if use_cuda: + zeros_ = zeros_.cuda() + + inputs = torch.cat((inputs, zeros_)) + else: + dilated_steps = n_steps // rate + + return inputs, dilated_steps + + def _prepare_inputs(self, inputs, rate): + dilated_inputs = torch.cat([inputs[j::rate, :, :] for j in range(rate)], 1) + return dilated_inputs + + def init_hidden(self, batch_size, hidden_dim): + hidden = torch.zeros(batch_size, hidden_dim) + if use_cuda: + hidden = hidden.cuda() + if self.cell_type == "LSTM": + memory = torch.zeros(batch_size, hidden_dim) + if use_cuda: + memory = memory.cuda() + return (hidden, memory) + else: + return hidden diff --git a/slp/modules/helpers.py b/slp/modules/helpers.py index e476954..908fa83 100644 --- a/slp/modules/helpers.py +++ b/slp/modules/helpers.py @@ -9,9 +9,9 @@ def __init__(self, batch_first=True): super(PadPackedSequence, self).__init__() self.batch_first = batch_first - def forward(self, x, lengths, max_length): + def forward(self, x, lengths): # import pdb; pdb.set_trace() -# max_length = lengths.max().item() + max_length = lengths.max().item() x, _ = pad_packed_sequence( x, batch_first=self.batch_first, total_length=max_length) return x diff --git a/slp/trainer/trainer_title_no_validation.py b/slp/trainer/trainer_title_no_validation.py index c9fef5f..bf4a519 100644 --- a/slp/trainer/trainer_title_no_validation.py +++ b/slp/trainer/trainer_title_no_validation.py @@ -12,6 +12,7 @@ from 
torch.nn.modules.loss import _Loss from torch.utils.data import DataLoader +from sklearn.metrics import f1_score from typing import cast, List, Optional, Tuple, TypeVar from slp.util import types from slp.util.parallel import DataParallelModel, DataParallelCriterion @@ -154,8 +155,9 @@ def train_step(self: TrainerType, engine: Engine, batch: List[torch.Tensor]) -> float: self.model.train() - #import pdb; pdb.set_trace() +# import pdb; pdb.set_trace() y_pred, targets = self.get_predictions_and_targets(batch) + loss = self.loss_fn(y_pred, targets.long()) # type: ignore if self.parallel: loss = loss.mean() @@ -173,7 +175,12 @@ def eval_step( batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: self.model.eval() with torch.no_grad(): +# import pdb; pdb.set_trace() y_pred, targets = self.get_predictions_and_targets(batch) + +# f1 = f1_score(targets, y_pred, average='macro') +# print(f1) + return y_pred, targets def predict(self: TrainerType, dataloader: DataLoader) -> State: @@ -275,7 +282,7 @@ def parse_batch( return inputs, inputs -class SequentialTrainer(Trainer): +class SequentialTrainerTouvlo(Trainer): def parse_batch( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: @@ -299,9 +306,6 @@ def parse_batch( title_lengths = to_device(batch[5], device=self.device, non_blocking=self.non_blocking) -# length_of_sentences = to_device(batch[5], -# device=self.device, -# non_blocking=self.non_blocking) return inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths @@ -309,11 +313,49 @@ def get_predictions_and_targets( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths = self.parse_batch(batch) - # import pdb; pdb.set_trace() + #import pdb; pdb.set_trace() y_pred = self.model(inputs, number_of_sentences, length_of_sentences, titles, title_lengths) return y_pred, targets + +class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: 
List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + +# import pdb; pdb.set_trace() + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + features = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[5], + device=self.device, + non_blocking=self.non_blocking) + + return inputs, titles, features, targets, lengths, title_lengths + + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + + # import pdb; pdb.set_trace() + inputs, titles, features, targets, lengths, title_lengths = self.parse_batch(batch) + y_pred = self.model(inputs, lengths, features, titles, title_lengths) + return y_pred, targets + class Seq2seqTrainer(SequentialTrainer): def parse_batch( self, From fdb94cbf65a3413620c47266b769ba619fc56cd1 Mon Sep 17 00:00:00 2001 From: danaiksez Date: Wed, 1 Apr 2020 18:10:02 +0300 Subject: [PATCH 4/5] kernel lexicon + minors --- examples/kernel-lexicon-algorithm.py | 59 +++++++++ examples/therapy_title.py | 32 +++-- examples/therapy_unbalanced.py | 15 ++- slp/data/seed_words | 28 ++++ slp/data/therapy_lexicon.py | 125 ++++++++++++++++++ slp/data/therapy_title.py | 47 ++++--- slp/modules/basic_model.py | 29 ++-- .../hier_att_net_title_attentional_embed.py | 1 - slp/trainer/trainer_title_no_validation.py | 3 + 9 files changed, 284 insertions(+), 55 deletions(-) create mode 100644 examples/kernel-lexicon-algorithm.py create mode 100644 slp/data/seed_words create mode 100644 slp/data/therapy_lexicon.py diff --git a/examples/kernel-lexicon-algorithm.py b/examples/kernel-lexicon-algorithm.py new file mode 100644 index 0000000..b9959dd --- /dev/null +++ 
b/examples/kernel-lexicon-algorithm.py @@ -0,0 +1,59 @@ +import padasip as pa +import numpy as np +import matplotlib.pylab as plt + +from torch.utils.data import DataLoader, SubsetRandomSampler + +#from slp.data.diction import seeds_diction +from slp.util.embeddings import EmbeddingsLoader +from slp.data.transforms import SpacyTokenizer +from slp.data.therapy_lexicon import PsychologicalDataset, TupleDataset +from sklearn.metrics.pairwise import cosine_similarity + +DATASET = '../../../whole-dataset.csv' + + +if __name__ == '__main__': + + Kseeds = 200 + max_word_length = 150 +# seed_set = list(seeds_diction.keys()) + + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + + tokenizer = SpacyTokenizer() + bio = PsychologicalDataset( + DATASET, + '../../../test_CEL/slp/data/psychotherapy', + max_word_length, + text_transforms = tokenizer) + + corpus = [] + for i, (text, title, lab) in enumerate(bio): + corpus.extend(text) + + + + import pdb; pdb.set_trace() + corpus = np.unique(corpus) + vocabulary = [word for word in corpus if word not in seed_set] + Nwords = len(vocabulary) + + #x-input matrix initialization + x = np.zeros(Kseeds, Nwords) + i = 0 + for word in vocabulary: + wv = word2idx[word] + j = 0 + for seed in seed_set: + ws = word2idx[seed] + d = cosine_similarity(wv, ws) + x[i][j] = d * seeds_diction[seed] + j += 1 + i += 1 + + #filter definition + f = pa.filters.FilterLMS(n=Nwords, mu=0.01, w="random") + mul = np.matmul(x, d) + f.run(mul, x) diff --git a/examples/therapy_title.py b/examples/therapy_title.py index c20b279..0d105fa 100644 --- a/examples/therapy_title.py +++ b/examples/therapy_title.py @@ -4,7 +4,7 @@ import torch.nn as nn from torch.optim import Adam -from ignite.metrics import Loss, Accuracy +from ignite.metrics import Loss, Accuracy, Precision, Recall from torch.utils.data import DataLoader, SubsetRandomSampler from torchvision.transforms import Compose from 
sklearn.model_selection import KFold @@ -57,7 +57,7 @@ def train_test_split(dataset, batch_train, batch_val, train_indices = indices[test_split:] val_indices = indices[:test_split] - import pdb; pdb.set_trace() + #import pdb; pdb.set_trace() return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) @@ -72,9 +72,17 @@ def trainer_factory(embeddings, device=DEVICE): model = model.to(DEVICE) criterion = nn.CrossEntropyLoss() optimizer = Adam(model.parameters(), lr=0.0005) + precision = Precision(average=False) + recall = Recall(average=False) + avg_prec = precision.mean() + avg_rec = recall.mean() + F1 = (precision * recall * 2/(precision + recall + 1e-7)).mean() metrics = { 'accuracy': Accuracy(), + 'precision': avg_prec, + 'recall': avg_rec, + 'f1': F1, 'loss': Loss(criterion) } @@ -83,6 +91,7 @@ def trainer_factory(embeddings, device=DEVICE): optimizer, checkpoint_dir='../checkpoints' if not DEBUG else None, metrics=metrics, + model_checkpoint = '../experiment_model.best.pth', non_blocking=True, patience=10, loss_fn=criterion, @@ -118,8 +127,9 @@ def trainer_factory(embeddings, device=DEVICE): bio = PsychologicalDataset( # '../data/balanced_new_csv.csv', - '../../../test_dataset.csv', +# '../../../test_dataset.csv', # '../../../depressive_dataset.csv', + '../../../whole-dataset.csv', '../../../test_CEL/slp/data/psychotherapy/', max_word_length, text_transforms = Compose([ @@ -131,25 +141,23 @@ def trainer_factory(embeddings, device=DEVICE): de = 0 nd = 0 m = 0 -# for i, (t,x,feat, l) in enumerate(bio): -# m += 1 -# if (l==1): -# de += 1 -# else: -# nd += 1 + for i, (t,x,feat, l) in enumerate(bio): + m += 1 + if (l==1): + de += 1 + else: + nd += 1 print(m) print("----------------") print(de) print(nd) print("----------------") - - if KFOLD: cv_scores = [] import gc for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): -# import pdb; pdb.set_trace() + #import pdb; pdb.set_trace() trainer = 
trainer_factory(embeddings, device=DEVICE) fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) cv_scores.append(fold_score) diff --git a/examples/therapy_unbalanced.py b/examples/therapy_unbalanced.py index 8b13ac9..5b73765 100644 --- a/examples/therapy_unbalanced.py +++ b/examples/therapy_unbalanced.py @@ -128,7 +128,8 @@ def trainer_factory(embeddings, device=DEVICE): # '../data/balanced_new_csv.csv', # '../../../test_dataset.csv', # '../../../depressive_dataset.csv', - '../../../unbalanced_dataset.csv', +# '../../../unbalanced_dataset.csv', + '../../../whole-dataset.csv', '../../../test_CEL/slp/data/psychotherapy/', max_word_length, text_transforms = Compose([ @@ -140,12 +141,12 @@ def trainer_factory(embeddings, device=DEVICE): de = 0 nd = 0 m = 0 - for i, (t,x,l) in enumerate(bio): - m += 1 - if (l==1): - de += 1 - else: - nd += 1 +# for i, (t,x,f,l) in enumerate(bio): +# m += 1 +# if (l==1): +# de += 1 +# else: +# nd += 1 print(m) print("----------------") print(de) diff --git a/slp/data/seed_words b/slp/data/seed_words new file mode 100644 index 0000000..d7ba38e --- /dev/null +++ b/slp/data/seed_words @@ -0,0 +1,28 @@ +depressed = ['temper' 'pulled' 'guidance' 'required' 'agree' 'motivation' + 'field' 'scary' 'tough' 'school' 'vacation' 'responsible' 'study' + 'crossroads' 'wink' 'competitive' 'groove' 'anxiousness' + 'procrastination' 'established' 'usually' 'knowing' 'technically' + 'med' 'putting' 'library' 'prepare' 'overcome' 'read' 'counselor' + 'may' 'education' 'routine' 'awhile' 'studying' 'depressing' 'comes' + 'pushing' 'research' 'reality' 'alright' 'brothers' 'road' + 'responsibility' 'frustrating' 'books' 'problem' 'listening' 'girlfriend' + 'fast' 'thoughts' 'mentioned' 'fear' 'reading' 'helping' 'older' 'write' + 'answer' 'monday' 'friday' 'lose' 'means' 'dad' 'phone' 'move' 'control' + 'mom' 'especially' 'important' 'hour' 'yesterday' 'close' 'believe' 'sit' + 'job' 'gets' 'making' 'called' 'sleep' 'weeks' 'real' 'fame', 
'familiar' 'feared' 'family'] + +not-depressed = ['jenny' 'concord' 'glasses' 'tense' 'jabs' 'booked' 'upsets' 'thrilled' + 'frustrates' 'anxious' 'weekend' 'subtle' 'paying' 'incredibly' + 'denying' 'sat' 'expensive' 'idea' 'wrong' 'afford' + 'psychoanalysts' 'mail' 'metaphoric' 'transactional' 'costly' 'firms' 'low' 'tired' 'reality' + 'budgeted' 'pauses' 'taxing' 'excited' 'kids' 'symbolize' 'relationships' + 'money' 'snappy' 'provoking' 'bonus' 'upfront' 'purchased' 'wrote' + 'psychoanalytic' 'mailing' 'compelled' 'guilty' 'unclear' 'bump' 'wins' 'great' 'job' + 'outsider' 'problematic' 'substantial' 'agrees' + 'conversation' 'boundaries' 'pro' 'lease' 'fee' + 'wanting' 'comfortable' 'pretending' 'mad' 'begins' 'timing' 'arent' + 'practical' 'functions' 'involved' 'detail' 'pay' 'deals' 'consequences' + 'therapists' 'referring' 'desperate' 'engaged' 'cousins' 'relieved' + ] + + diff --git a/slp/data/therapy_lexicon.py b/slp/data/therapy_lexicon.py new file mode 100644 index 0000000..623c827 --- /dev/null +++ b/slp/data/therapy_lexicon.py @@ -0,0 +1,125 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset +from itertools import groupby + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self._file = csv_file + self.root_dir = root_dir + 
self.max_word_len = max_word_len + self.transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self._file) + self.patient_turns = ['FEMALE CLIENT', 'MALE CLIENT', 'Audience', 'CLIENT','PT','PATIENT','CL','Client','Danny','Juan', 'MAN', + 'PARTICIPANT', 'CG', 'MAN', 'RESPONDENT','F','Angie','Jeff', 'Bill', 'Jim', 'Leah', 'Kelly', 'MRS. NAVNOR', + 'MR. NAVNOR', 'MICHELLE', 'Phil', 'FEMALE PARTICIPANT', 'Mom', 'Nicole', 'LINDA', + 'MALE PARTICIPANT', 'Blake', 'M', 'Claudette', 'MR. VAC', 'Marie', 'Robin', 'Mike', 'Gina', 'FEMALE', 'LORI' + ,'Joshua', 'Shayla', 'Greg', 'Barbara', 'MARGE', 'ANN LARKIN', 'EDWARD', 'Mark', 'PATiENT'] + self.therapist_turns = ['CONNIRAE', 'ANALYST', 'THERAPIST','COUNSELOR','DR','M','Therapist','Marlatt', 'Lazarus','INTERVIEWER', + 'TH', 'Johnson', 'Scharff', 'T', 'Counselor', 'Wubbolding', 'DR. WARKENTIN', 'Bugental', 'Powers', 'Koocher', + 'Dr. Sklare', 'BECKER', 'Hardy', 'MODERATOR', 'Masek', 'VIRGINIA','MODERATOR', 'Oaklander', 'McCrady', + 'Bugental', 'Krumboltz', 'Miller', 'ANDREAS', 'Kottman', 'Utigaard', 'Wubbolding', 'Carlson', 'JOSH LOMAN', + 'Zweben'] + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. 
subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] +# _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + mean_length = 0 + import re + if self.transforms is not None: + lista = [] + turns = [] + total_turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + p1 = [x for x in p if x!=''] +# import pdb; pdb.set_trace() + for i in p1: + i = i.split(":") + if len(i) >= 2: + lista.extend(self.transforms(i[1])) + + if len(lista) == 0: + import pdb; pdb.set_trace() + + preprocessed_text = lista + preprocessed_title = self.transforms(title) + + lab = int("Depressive disorder" in metadata[7] or "Depressive disorder" in label + or "Depression (emotion)" in label or "Depression (eotion)" in metadata[7]) + + return (preprocessed_text, preprocessed_title, lab) + diff --git a/slp/data/therapy_title.py b/slp/data/therapy_title.py index b93ee4d..8ee9e8f 100644 --- a/slp/data/therapy_title.py +++ b/slp/data/therapy_title.py @@ -82,10 +82,13 @@ def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): self.patient_turns = ['FEMALE CLIENT', 'MALE CLIENT', 'Audience', 'CLIENT','PT','PATIENT','CL','Client','Danny','Juan', 'MAN', 'PARTICIPANT', 'CG', 'MAN', 
'RESPONDENT','F','Angie','Jeff', 'Bill', 'Jim', 'Leah', 'Kelly', 'MRS. NAVNOR', 'MR. NAVNOR', 'MICHELLE', 'Phil', 'FEMALE PARTICIPANT', 'Mom', 'Nicole', 'LINDA', - 'MALE PARTICIPANT', 'Blake', 'M'] + 'MALE PARTICIPANT', 'Blake', 'M', 'Claudette', 'MR. VAC', 'Marie', 'Robin', 'Mike', 'Gina', 'FEMALE', 'LORI' + ,'Joshua', 'Shayla', 'Greg', 'Barbara', 'MARGE', 'ANN LARKIN', 'EDWARD', 'Mark', 'PATiENT'] self.therapist_turns = ['CONNIRAE', 'ANALYST', 'THERAPIST','COUNSELOR','DR','M','Therapist','Marlatt', 'Lazarus','INTERVIEWER', 'TH', 'Johnson', 'Scharff', 'T', 'Counselor', 'Wubbolding', 'DR. WARKENTIN', 'Bugental', 'Powers', 'Koocher', - 'Dr. Sklare', 'BECKER', 'Hardy', 'MODERATOR', 'Masek', 'VIRGINIA','MODERATOR', 'Oaklander'] + 'Dr. Sklare', 'BECKER', 'Hardy', 'MODERATOR', 'Masek', 'VIRGINIA','MODERATOR', 'Oaklander', 'McCrady', + 'Bugental', 'Krumboltz', 'Miller', 'ANDREAS', 'Kottman', 'Utigaard', 'Wubbolding', 'Carlson', 'JOSH LOMAN', + 'Zweben'] def get_files_labels_metadata(self, root_dir, _file): @@ -141,29 +144,31 @@ def __getitem__(self, idx): p = strip_tags(preprocessed_text) p = p.split("\n") p1 = [x for x in p if x!=''] +# import pdb; pdb.set_trace() for i in p1: i = i.split(":") # if any(c in i[0] for c in self.therapist_turns): -# import pdb; pdb.set_trace() if len(i) != 1 and not '' in i: - if (i[0] in self.patient_turns): - turns.append(i[0]) - total_turns.append(0) - lista.append(self.text_transforms(i[1])) - mean_length = mean_length + len(i[1]) - - elif (i[0] not in self.therapist_turns): - match = re.match(r"([a-z]+)([0-9]+)", i[0], re.I) or re.match(r"([a-z]+)( )([0-9]+)", i[0], re.I) - if match: - items = match.groups() - if ((items[0] in self.patient_turns) or (i[0] in self.therapist_turns)): - turns.append(i[0]) - total_turns.append(0) - lista.append(self.text_transforms(i[1])) - mean_length = mean_length + len(i[1]) - - elif (i[0] in self.therapist_turns): - total_turns.append(1) + s = self.text_transforms(i[1]) + if len(s) >= 5: + if (i[0] in 
self.patient_turns): + turns.append(i[0]) + total_turns.append(0) + lista.append(s) + mean_length = mean_length + len(i[1]) + + elif (i[0] not in self.therapist_turns): + match = re.match(r"([a-z]+)([0-9]+)", i[0], re.I) or re.match(r"([a-z]+)( )([0-9]+)", i[0], re.I) + if match: + items = match.groups() + if ((items[0] in self.patient_turns) or (i[0] in self.therapist_turns)): + turns.append(i[0]) + total_turns.append(0) + lista.append(s) + mean_length = mean_length + len(i[1]) + + elif (i[0] in self.therapist_turns): + total_turns.append(1) if len(lista) == 0: import pdb; pdb.set_trace() diff --git a/slp/modules/basic_model.py b/slp/modules/basic_model.py index 3020d40..0b01a86 100644 --- a/slp/modules/basic_model.py +++ b/slp/modules/basic_model.py @@ -59,21 +59,21 @@ def __init__(self, hidden_size=300, num_classes=0): self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) self.context = nn.Linear(2 * hidden_size, 1, bias=False) - self.fc = nn.Linear(2 * hidden_size + 3, num_classes) + self.fc = nn.Linear(2 * hidden_size, num_classes) self.pack = PackSequence(batch_first=True) self.unpack = PadPackedSequence(batch_first=True) - def forward(self, inputs, features, lengths, hidden_state): + def forward(self, inputs, features, lengths, titles, hidden_state): # import pdb; pdb.set_trace() f_output, lengths = self.pack(inputs, lengths) f_output, h_output = self.gru(f_output, hidden_state) f_output = self.unpack(f_output, lengths) - #titles = torch.unsqueeze(titles, dim=1) -# f_output = torch.cat((f_output,features), dim=1) + titles = torch.unsqueeze(titles, dim=1) + f_output = torch.cat((f_output,titles), dim=1) output = self.sent(f_output) @@ -84,7 +84,7 @@ def forward(self, inputs, features, lengths, hidden_state): output = F.softmax(output, dim=1) output = (f_output * output).sum(1) - output = torch.cat((output,features), dim=1) +# output = torch.cat((output,features), dim=1) output = self.fc(output).squeeze() return output, h_output @@ -120,6 +120,9 @@ def 
forward(self, inputs, lengths, features, titles, title_lengths): text = inputs.permute(1, 0, 2) all_word_lengths = [] output_list_text = [] + +# import pdb; pdb.set_trace() + for i in text: word_lengths = i.size(1) - (i==0).sum(dim=1) @@ -136,20 +139,18 @@ def forward(self, inputs, lengths, features, titles, title_lengths): - #output_title, self.word_hidden_state = self.word_att_net_text(titles, # try without title - # title_lengths, - # self.word_hidden_state, - # self.idx2word, - # self.lex_size, - # is_title=True) #[8,600] - #self.word_hidden_state = repackage_hidden(self.word_hidden_state) - + output_title, self.word_hidden_state = self.word_att_net_text(titles, ### title + title_lengths, + self.word_hidden_state, + is_title=True) #[8,600] + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + # output_list_text = (S, B, 600) # output_list_text = (S, B, 760) # import pdb; pdb.set_trace() output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) - output, self.sent_hidden_state = self.sent_att_net(output_list_text, features, lengths, self.sent_hidden_state) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, features, lengths, output_title, self.sent_hidden_state) self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) return output diff --git a/slp/modules/hier_att_net_title_attentional_embed.py b/slp/modules/hier_att_net_title_attentional_embed.py index 32086f7..199b2aa 100644 --- a/slp/modules/hier_att_net_title_attentional_embed.py +++ b/slp/modules/hier_att_net_title_attentional_embed.py @@ -9,7 +9,6 @@ DEVICE = 'cpu' #DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' - def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" diff --git a/slp/trainer/trainer_title_no_validation.py b/slp/trainer/trainer_title_no_validation.py index bf4a519..ed38ed0 100644 --- a/slp/trainer/trainer_title_no_validation.py +++ 
b/slp/trainer/trainer_title_no_validation.py @@ -54,9 +54,12 @@ def __init__(self: TrainerType, self.accumulation_steps = accumulation_steps self.checkpoint_dir = checkpoint_dir +# import pdb; pdb.set_trace() + model_checkpoint = self._check_checkpoint(model_checkpoint) optimizer_checkpoint = self._check_checkpoint(optimizer_checkpoint) + self.model = cast(nn.Module, from_checkpoint( model_checkpoint, model, map_location=torch.device('cpu'))) self.model = self.model.type(dtype).to(device) From 87772cc598592b9145a095d03558a5d731893576 Mon Sep 17 00:00:00 2001 From: danaiksez Date: Wed, 1 Apr 2020 23:19:09 +0300 Subject: [PATCH 5/5] seed words +ratings --- slp/data/seed_words.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 slp/data/seed_words.py diff --git a/slp/data/seed_words.py b/slp/data/seed_words.py new file mode 100644 index 0000000..84c4f23 --- /dev/null +++ b/slp/data/seed_words.py @@ -0,0 +1,16 @@ +depressed = {'temper':0, 'pulled':1 ,'guidance':0, 'required':0, 'agree':-1, 'motivation':-1, 'field':0, 'scary':1, 'tough':1, 'school':0, + 'vacation':-1, 'responsible':-1, 'study':0, 'crossroads':0, 'wink':-1, 'competitive':-1, 'anxiousness':1, 'procrastination':1, + 'established':0, 'usually':0, 'knowing':0, 'technically':0, 'med':0, 'putting':0, 'library':0, 'prepare':-1, 'overcome':-1, 'read':0, + 'counselor':0, 'may':0, 'education':0, 'routine':0, 'studying':0, 'depressing':1, 'comes':0, 'pushing':1, 'research':0, + 'reality':0, 'alright':-1, 'brothers':0, 'road':0, 'responsibility':-1, 'frustrating':1, 'books':0, 'problem':1, 'listening':0, + 'girlfriend':0, 'fast':0, 'thoughts':0, 'mentioned':0, 'fear':1, 'reading':0, 'helping':-1, 'older':0, 'write':0, 'answer':0, +'monday':1, 'friday':-1, 'lose':1, 'means':0, 'dad':0, 'phone':0, 'move':-1, 'control':1, 'mom':0, 'especially':0, 'important':-1, 'hour':0, +'yesterday':0, 'close':-1, 'believe':0, 'sit':0, 'job':0, 'gets':-1, 'making':-1, 'called':0, 'sleep':0,
'weeks':0, 'real':-1, 'fame':-1, 'familiar':-1, 'feared':1, 'family':0} + +notdepressed = {'concord':-1, 'tense':1, 'jabs':1, 'booked':-1, 'upsets':1, 'thrilled':-1, 'frustrates':1, 'anxious':1, 'weekend':-1, 'subtle':-1, 'paying':1, 'incredibly':-1, 'denying':1, 'sat':0, 'expensive':1, 'idea':0, 'wrong':1, 'afford':-1, 'psychoanalysts':0, +'mail':0, 'metaphoric':0, 'transactional':0, 'costly':1, 'firms':0, 'low':1, 'tired':1, 'budgeted':-1, 'pauses':1, 'taxing':1, +'excited':1, 'kids':0, 'symbolize':0, 'relationships':0, 'money':0, 'snappy':-1, 'provoking':1, 'bonus':-1, 'upfront':-1, 'purchased':-1, +'wrote':-1, 'psychoanalytic':0, 'compelled':1, 'guilty':1, 'unclear':1, 'bump':1, 'wins':-1, 'great':-1, 'outsider':1, 'problematic':1, 'substantial':-1, 'agrees':-1, 'conversation':-1, 'boundaries':-1, 'pro':-1, 'lease':0, 'fee':1, 'wanting':0, 'comfortable':-1, +'pretending':1, 'mad':1, 'begins':-1, 'timing':0, 'practical':-1, 'detail':0, 'pay':1, 'deals':-1, 'consequences':1, 'therapists':0, 'referring':0, 'desperate':1, 'cousins':0, 'relieved':-1} + +