diff --git a/examples/basic_model_DRNN.py b/examples/basic_model_DRNN.py new file mode 100644 index 0000000..378c9a2 --- /dev/null +++ b/examples/basic_model_DRNN.py @@ -0,0 +1,163 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.basic_model_DRNN import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + num_workers=0, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + num_workers=0, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 4 + batch_val = 4 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 4 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() 
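# The transforms assembled below form one Compose pipeline applied to every turn:
# SpacyTokenizer splits the raw text into tokens, ReplaceUnknownToken presumably maps
# out-of-vocabulary tokens to the unknown token, ToTokenIds(word2idx) converts tokens
# to GloVe vocabulary indices, and ToTensor moves the result to DEVICE.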
+ replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/kernel-lexicon-algorithm.py b/examples/kernel-lexicon-algorithm.py new file mode 100644 index 0000000..b9959dd --- /dev/null +++ b/examples/kernel-lexicon-algorithm.py @@ -0,0 +1,59 @@ +import padasip as pa +import numpy as np +import matplotlib.pylab as plt + +from torch.utils.data import DataLoader, SubsetRandomSampler + +#from slp.data.diction import seeds_diction +from slp.util.embeddings import EmbeddingsLoader +from slp.data.transforms import SpacyTokenizer +from slp.data.therapy_lexicon import PsychologicalDataset, TupleDataset +from sklearn.metrics.pairwise import cosine_similarity + +DATASET = '../../../whole-dataset.csv' + + +if __name__ == '__main__': + + Kseeds = 200 + max_word_length = 150 +# seed_set = list(seeds_diction.keys()) + + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + + tokenizer = SpacyTokenizer() + bio = PsychologicalDataset( + DATASET, + '../../../test_CEL/slp/data/psychotherapy', + max_word_length, + text_transforms = tokenizer) + + corpus = [] + for i, (text, title, lab) in enumerate(bio): + corpus.extend(text) + + + + import pdb; pdb.set_trace() + corpus = np.unique(corpus) + vocabulary = [word for word in corpus if word not in seed_set] + Nwords = len(vocabulary) + + #x-input matrix initialization + x = np.zeros(Kseeds, Nwords) + i = 0 + for word in vocabulary: + wv = word2idx[word] + j = 0 + for seed in seed_set: + ws = word2idx[seed] + d = cosine_similarity(wv, ws) + x[i][j] = d * seeds_diction[seed] + j += 1 + i += 1 + + #filter definition + f = pa.filters.FilterLMS(n=Nwords, mu=0.01, w="random") + mul = np.matmul(x, d) + f.run(mul, x) diff --git a/examples/therapy.py b/examples/therapy.py new file mode 100644 index 0000000..8ceabf3 --- /dev/null +++ b/examples/therapy.py @@ -0,0 +1,185 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from 
torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.001) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) +# to_tensor = ToTensor(device='cpu') + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../data/psychotherapy/', + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + +# train_loader, val_loader = train_test_split(bio, batch_train, batch_val, test_size=.2) + + + #model = HierAttNet( + # 
hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + #model = model.to(DEVICE) + #criterion = nn.CrossEntropyLoss() + #optimizer = Adam(model.parameters(), lr=0.001) + + #metrics = { + # 'accuracy': Accuracy(), + # 'loss': Loss(criterion) + #} + + + #trainer = SequentialTrainer( +# model, +# optimizer, +# checkpoint_dir='../checkpoints' if not DEBUG else None, +# metrics=metrics, +# non_blocking=True, +# patience=10, +# loss_fn=criterion, +# device=DEVICE) + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title.py b/examples/therapy_title.py new file mode 100644 index 0000000..0d105fa --- /dev/null +++ b/examples/therapy_title.py @@ -0,0 +1,189 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy, Precision, Recall +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.basic_model import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + 
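# Illustrative numbers only: with dataset_size = 100 and the default test_size = 0.1,
# test_split == 10, so 90 of the shuffled indices form the training set here and the
# remaining 10 form the validation set below.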
val_indices = indices[:test_split] + + #import pdb; pdb.set_trace() + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + precision = Precision(average=False) + recall = Recall(average=False) + avg_prec = precision.mean() + avg_rec = recall.mean() + F1 = (precision * recall * 2/(precision + recall + 1e-7)).mean() + + metrics = { + 'accuracy': Accuracy(), + 'precision': avg_prec, + 'recall': avg_rec, + 'f1': F1, + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + model_checkpoint = '../experiment_model.best.pth', + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( +# '../data/balanced_new_csv.csv', +# '../../../test_dataset.csv', +# '../../../depressive_dataset.csv', + '../../../whole-dataset.csv', + '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + de = 0 + nd = 0 + m = 0 + for i, (t,x,feat, l) in enumerate(bio): + m += 1 + if (l==1): + de += 1 + else: + nd += 1 + print(m) + print("----------------") + print(de) + print(nd) + print("----------------") + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + #import pdb; pdb.set_trace() + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + 
print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title_attentional_embed.py b/examples/therapy_title_attentional_embed.py new file mode 100644 index 0000000..4049e6e --- /dev/null +++ b/examples/therapy_title_attentional_embed.py @@ -0,0 +1,164 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators_title import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net_title_attentional_embed import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +DEVICE = 'cpu' +#DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device='cpu') + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + num_workers=0, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + num_workers=0, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, idx2word, lex_size, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word, lex_size) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 100 + batch_val = 100 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - 
after padding + num_classes = 2 + batch_size = 100 + hidden_size = 300 + lex_size = 99 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device='cpu') + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/examples/therapy_title_attentional_embed_on2.py b/examples/therapy_title_attentional_embed_on2.py new file mode 100644 index 0000000..0c6d3ca --- /dev/null +++ b/examples/therapy_title_attentional_embed_on2.py @@ -0,0 +1,160 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators_title import SequenceClassificationCollator +from slp.data.therapy_title_on2 import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.hier_att_net_title_attentional_embed import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=False, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def 
train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, idx2word, lex_size, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings, idx2word, lex_size) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + metrics = { + 'accuracy': Accuracy(), + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir='../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + lex_size = 99 + + epochs = 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( + '../data/balanced_new_csv.csv', '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, idx2word, lex_size, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git 
a/examples/therapy_unbalanced.py b/examples/therapy_unbalanced.py new file mode 100644 index 0000000..5b73765 --- /dev/null +++ b/examples/therapy_unbalanced.py @@ -0,0 +1,191 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.optim import Adam + +from ignite.metrics import Loss, Accuracy, Precision, Recall +from torch.utils.data import DataLoader, SubsetRandomSampler +from torchvision.transforms import Compose +from sklearn.model_selection import KFold + +from slp.data.collators import SequenceClassificationCollator +from slp.data.therapy_title import PsychologicalDataset, TupleDataset +from slp.data.transforms import SpacyTokenizer, ToTokenIds, ToTensor, ReplaceUnknownToken +from slp.modules.basic_model import HierAttNet +from slp.util.embeddings import EmbeddingsLoader +from slp.trainer.trainer_title_no_validation import SequentialTrainer + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +COLLATE_FN = SequenceClassificationCollator(device=DEVICE) + +DEBUG = False +KFOLD = True +MAX_EPOCHS = 50 + +def dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val): + train_sampler = SubsetRandomSampler(train_indices) + val_sampler = SubsetRandomSampler(val_indices) + + train_loader = DataLoader( + dataset, + batch_size=batch_train, + sampler=train_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + val_loader = DataLoader( + dataset, + batch_size=batch_val, + sampler=val_sampler, + drop_last=True, + collate_fn=COLLATE_FN) + + return train_loader, val_loader + +def train_test_split(dataset, batch_train, batch_val, + test_size=0.1, shuffle=True, seed=42): + dataset_size = len(dataset) + indices = list(range(dataset_size)) + test_split = int(np.floor(test_size * dataset_size)) + if shuffle: + if seed is not None: + np.random.seed(seed) + np.random.shuffle(indices) + + train_indices = indices[test_split:] + val_indices = indices[:test_split] + + import pdb; pdb.set_trace() + return dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + + +def kfold_split(dataset, batch_train, batch_val, k=5, shuffle=True, seed=None): + kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed) + for train_indices, val_indices in kfold.split(dataset): + yield dataloaders_from_indices(dataset, train_indices, val_indices, batch_train, batch_val) + +def trainer_factory(embeddings, device=DEVICE): + model = HierAttNet( + hidden_size, batch_size, num_classes, max_sent_length, len(embeddings), embeddings) + model = model.to(DEVICE) + criterion = nn.CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.0005) + + precision = Precision(average=False) + recall = Recall(average=False) + avg_prec = precision.mean() + avg_rec = recall.mean() + F1 = (precision * recall * 2 / (precision + recall + 1e-7)).mean() + metrics = { + 'accuracy': Accuracy(), + 'presicion': avg_prec, + 'recall': avg_rec, + 'f1': F1, + 'loss': Loss(criterion) + } + + trainer = SequentialTrainer( + model, + optimizer, + checkpoint_dir=None, # '../checkpoints' if not DEBUG else None, + metrics=metrics, + non_blocking=True, + patience=10, + loss_fn=criterion, + device=DEVICE) + + return trainer + + +if __name__ == '__main__': + + ####### Parameters ######## + batch_train = 8 + batch_val = 8 + + max_sent_length = 500 #max number of sentences (turns) in transcript - after padding + max_word_length = 150 #max length of each sentence (turn) - after padding + num_classes = 2 + batch_size = 8 + hidden_size = 300 + + epochs 
= 40 + +# loader = EmbeddingsLoader('../data/glove.6B.300d.txt', 300) + + loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300) + word2idx, idx2word, embeddings = loader.load() + embeddings = torch.tensor(embeddings) + + tokenizer = SpacyTokenizer() + replace_unknowns = ReplaceUnknownToken() + to_token_ids = ToTokenIds(word2idx) + to_tensor = ToTensor(device=DEVICE) + + bio = PsychologicalDataset( +# '../data/balanced_new_csv.csv', +# '../../../test_dataset.csv', +# '../../../depressive_dataset.csv', +# '../../../unbalanced_dataset.csv', + '../../../whole-dataset.csv', + '../../../test_CEL/slp/data/psychotherapy/', + max_word_length, + text_transforms = Compose([ + tokenizer, + replace_unknowns, + to_token_ids, + to_tensor])) + + de = 0 + nd = 0 + m = 0 +# for i, (t,x,f,l) in enumerate(bio): +# m += 1 +# if (l==1): +# de += 1 +# else: +# nd += 1 + print(m) + print("----------------") + print(de) + print(nd) + print("----------------") + + + + if KFOLD: + cv_scores = [] + import gc + for train_loader, val_loader in kfold_split(bio, batch_train, batch_val): +# import pdb; pdb.set_trace() + trainer = trainer_factory(embeddings, device=DEVICE) + fold_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + cv_scores.append(fold_score) + print("**********************") + print("edw") + print(fold_score) + del trainer + gc.collect() + final_score = float(sum(cv_scores)) / len(cv_scores) + else: + train_loader, val_loader = train_test_split(bio, batch_train, batch_val) + trainer = trainer_factory(embeddings, device=DEVICE) + final_score = trainer.fit(train_loader, val_loader, epochs=MAX_EPOCHS) + + print(f'Final score: {final_score}') + + + + + if DEBUG: + print("Starting end to end test") + print("-----------------------------------------------------------------------") + trainer.fit_debug(train_loader, val_loader) + print("Overfitting single batch") + print("-----------------------------------------------------------------------") + trainer.overfit_single_batch(train_loader) +# else: +# print("started the else part") +# trainer.fit(train_loader, val_loader, epochs = epochs) diff --git a/slp/data/collators.py b/slp/data/collators.py index 222fa82..2a3711f 100644 --- a/slp/data/collators.py +++ b/slp/data/collators.py @@ -3,6 +3,7 @@ from slp.modules.util import pad_mask, subsequent_mask from slp.util import mktensor +from slp.data.transforms import ToTensor class SequenceClassificationCollator(object): @@ -11,15 +12,62 @@ def __init__(self, pad_indx=0, device='cpu'): self.device = device def __call__(self, batch): - inputs, targets = map(list, zip(*batch)) + inputs, titles, features, targets = map(list, zip(*batch)) + lengths = torch.tensor([len(s) for s in inputs], device=self.device) + lengths_title = torch.tensor([len(t) for t in titles], device=self.device) + + # Pad and convert to tensor + inputs = (pad_sequence(inputs, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + titles = (pad_sequence(titles, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + + targets = mktensor(targets, device=self.device, dtype=torch.long) + features = mktensor(features, device=self.device) + return inputs, titles, features.to(self.device), targets.to(self.device), lengths, lengths_title + + +class BertCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): +# import pdb; pdb.set_trace() + inputs, targets = map(list, zip(*batch)) # Pad and convert to 
tensor inputs = (pad_sequence(inputs, batch_first=True, padding_value=self.pad_indx) .to(self.device)) targets = mktensor(targets, device=self.device, dtype=torch.long) - return inputs, targets.to(self.device), lengths + + attention_masks = [] + segments = [] + for seq in inputs: + seq_mask = [float(i>0) for i in seq] + attention_masks.append(seq_mask) + segm = [0] * len(seq) + segments.append(segm) + + masks = mktensor(attention_masks, device=self.device, dtype=torch.long) + segments = mktensor(segments, device=self.device, dtype=torch.long) + + return inputs, targets.to(self.device), masks.to(self.device), segments.to(self.device) + + + + + + + + + class TransformerCollator(object): diff --git a/slp/data/collators_title.py b/slp/data/collators_title.py new file mode 100644 index 0000000..2178290 --- /dev/null +++ b/slp/data/collators_title.py @@ -0,0 +1,84 @@ +import torch +from torch.nn.utils.rnn import pack_padded_sequence , pad_sequence + +from slp.modules.util import pad_mask, subsequent_mask +from slp.util import mktensor +from slp.data.therapy_title import pad_sequence as pad_sequence1 + +class SequenceClassificationCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): + #import pdb; pdb.set_trace() + + inputs, titles, targets = map(list, zip(*batch)) + number_of_sentences = torch.tensor([len(s) for s in inputs], device=self.device) + length_of_sentences = ([torch.tensor([len(s) for s in inp]) for inp in inputs]) + + inputs = [pad_sequence1(i, padding_len=150, batch_first=True, padding_value=0) for i in inputs] + + + # Pad and convert to tensor + inputs = (pad_sequence(inputs, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + length_of_sentences = pad_sequence1(length_of_sentences, padding_len=inputs.shape[1], batch_first=True, padding_value=1) + + lengths_title = torch.tensor([len(t) for t in titles], device=self.device) + titles = (pad_sequence(titles, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + + targets = mktensor(targets, device=self.device, dtype=torch.long) + return inputs, titles, targets.to(self.device), number_of_sentences, length_of_sentences, lengths_title + + +class TransformerCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def pad_and_mask(self, tensors): + lengths = torch.tensor([len(s) for s in tensors], + device=self.device) + max_length = torch.max(lengths) + pad_m = pad_mask(lengths, max_length=max_length, device=self.device) + sub_m = subsequent_mask(max_length) + tensors = (pad_sequence(tensors, + batch_first=True, + padding_value=self.pad_indx) + .to(self.device)) + return tensors, pad_m, sub_m + + @staticmethod + def get_inputs_and_targets(batch): + inputs, targets = map(list, zip(*batch)) + return inputs, targets + + def __call__(self, batch): + inputs, targets = self.get_inputs_and_targets(batch) + inputs, pad_m_inputs, _ = self.pad_and_mask(inputs) + targets, pad_m_targets, sub_m = self.pad_and_mask(targets) + mask_targets = pad_m_targets.unsqueeze(-2) * sub_m + mask_inputs = pad_m_inputs.unsqueeze(-2) + return inputs, targets, mask_inputs, mask_targets + + +class PackedSequenceCollator(object): + def __init__(self, pad_indx=0, device='cpu', batch_first=True): + self.seq_collator = SequenceClassificationCollator( + pad_indx=pad_indx, device=device) + self.batch_first = batch_first + self.device = device + + def __call__(self, batch): + inputs, 
targets, lengths = self.seq_collator(batch) + inputs = pack_padded_sequence( + inputs, lengths, + batch_first=self.batch_first, + enforce_sorted=False) + return inputs, targets.to(self.device), lengths[inputs.sorted_indices] diff --git a/slp/data/collators_title_touvlo.py b/slp/data/collators_title_touvlo.py new file mode 100644 index 0000000..5bf2617 --- /dev/null +++ b/slp/data/collators_title_touvlo.py @@ -0,0 +1,84 @@ +import torch +from torch.nn.utils.rnn import pack_padded_sequence , pad_sequence + +from slp.modules.util import pad_mask, subsequent_mask +from slp.util import mktensor +from slp.data.therapy_title import pad_sequence as pad_sequence1 + +class SequenceClassificationCollator(object): + def __init__(self, pad_indx=0, device='cpu'): + self.pad_indx = pad_indx + self.device = device + + def __call__(self, batch): +# import pdb; pdb.set_trace() + padding_len=150 + inputs, titles, targets = map(list, zip(*batch)) + number_of_sentences = torch.tensor([len(s) for s in inputs], device=self.device) + length_of_sentences = ([torch.tensor([len(s) if len(s) padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self.file = pd.read_csv(csv_file) + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. 
subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] + _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + + + if self.text_transforms is not None: + lista = [] + turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + + p = [x for x in p if x!=''] + + for i in p: + i = i.split(":") + if len(i) == 2: + turns.append(i[0]) + lista.append(self.text_transforms(i[1])) + #import pdb; pdb.set_trace() + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + + preprocessed_title = self.text_transforms(title) + + lab = int("Depression (emotion)" in label) + return (preprocessed_text, preprocessed_title, lab) diff --git a/slp/data/therapy_lexicon.py b/slp/data/therapy_lexicon.py new file mode 100644 index 0000000..623c827 --- /dev/null +++ b/slp/data/therapy_lexicon.py @@ -0,0 +1,125 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset +from itertools import groupby + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self._file = csv_file + self.root_dir = root_dir + self.max_word_len = max_word_len + self.transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self._file) + self.patient_turns = ['FEMALE CLIENT', 'MALE CLIENT', 'Audience', 'CLIENT','PT','PATIENT','CL','Client','Danny','Juan', 'MAN', + 'PARTICIPANT', 'CG', 'MAN', 'RESPONDENT','F','Angie','Jeff', 'Bill', 'Jim', 'Leah', 'Kelly', 'MRS. NAVNOR', + 'MR. NAVNOR', 'MICHELLE', 'Phil', 'FEMALE PARTICIPANT', 'Mom', 'Nicole', 'LINDA', + 'MALE PARTICIPANT', 'Blake', 'M', 'Claudette', 'MR. VAC', 'Marie', 'Robin', 'Mike', 'Gina', 'FEMALE', 'LORI' + ,'Joshua', 'Shayla', 'Greg', 'Barbara', 'MARGE', 'ANN LARKIN', 'EDWARD', 'Mark', 'PATiENT'] + self.therapist_turns = ['CONNIRAE', 'ANALYST', 'THERAPIST','COUNSELOR','DR','M','Therapist','Marlatt', 'Lazarus','INTERVIEWER', + 'TH', 'Johnson', 'Scharff', 'T', 'Counselor', 'Wubbolding', 'DR. WARKENTIN', 'Bugental', 'Powers', 'Koocher', + 'Dr. 
Sklare', 'BECKER', 'Hardy', 'MODERATOR', 'Masek', 'VIRGINIA','MODERATOR', 'Oaklander', 'McCrady', + 'Bugental', 'Krumboltz', 'Miller', 'ANDREAS', 'Kottman', 'Utigaard', 'Wubbolding', 'Carlson', 'JOSH LOMAN', + 'Zweben'] + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] +# _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + mean_length = 0 + import re + if self.transforms is not None: + lista = [] + turns = [] + total_turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + p1 = [x for x in p if x!=''] +# import pdb; pdb.set_trace() + for i in p1: + i = i.split(":") + if len(i) >= 2: + lista.extend(self.transforms(i[1])) + + if len(lista) == 0: + import pdb; pdb.set_trace() + + preprocessed_text = lista + preprocessed_title = self.transforms(title) + + lab = int("Depressive disorder" in metadata[7] or "Depressive disorder" in label + or "Depression (emotion)" in label or "Depression (eotion)" in metadata[7]) + + return (preprocessed_text, preprocessed_title, lab) + diff --git a/slp/data/therapy_title.py b/slp/data/therapy_title.py new file mode 100644 index 0000000..8ee9e8f --- /dev/null +++ b/slp/data/therapy_title.py @@ -0,0 +1,192 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset +from itertools import groupby + +def pad_sequence(sequences, batch_first=False, padding_len=None, padding_value=0): + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] +# import pdb; pdb.set_trace() + max_size = sequences[0].size() + + trailing_dims = max_size[1:] + if padding_len is not None: + max_len = padding_len + else: + max_len = max([s.size(0) for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) + for i, tensor in enumerate(sequences): + if tensor.size(0) > padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] 
= tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self._file = csv_file + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self._file) +# self.patient_turns = ['CLIENT','PT','PATIENT','CL','Client','Danny','Juan', 'MAN', +# 'PARTICIPANT', 'CG', 'MAN', 'RESPONDENT','F','Angie','Jeff', 'Bill', 'FEMALE PARTICIPANT','MALE PARTICIPANT', +# 'Koocher', 'Nicole', 'Blake'] +# self.therapist_turns = ['CONNIRAE', 'M', 'ANALYST', 'THERAPIST','COUNSELOR','DR','M','Therapist','Marlatt', +# 'Lazarus','INTERVIEWER','TH','Scharff', 'T', 'Counselor', 'Wubbolding', 'DR. WARKENTIN', 'MODERATOR', 'Leah' +# 'Masek', 'Oaklander'] + self.patient_turns = ['FEMALE CLIENT', 'MALE CLIENT', 'Audience', 'CLIENT','PT','PATIENT','CL','Client','Danny','Juan', 'MAN', + 'PARTICIPANT', 'CG', 'MAN', 'RESPONDENT','F','Angie','Jeff', 'Bill', 'Jim', 'Leah', 'Kelly', 'MRS. NAVNOR', + 'MR. NAVNOR', 'MICHELLE', 'Phil', 'FEMALE PARTICIPANT', 'Mom', 'Nicole', 'LINDA', + 'MALE PARTICIPANT', 'Blake', 'M', 'Claudette', 'MR. VAC', 'Marie', 'Robin', 'Mike', 'Gina', 'FEMALE', 'LORI' + ,'Joshua', 'Shayla', 'Greg', 'Barbara', 'MARGE', 'ANN LARKIN', 'EDWARD', 'Mark', 'PATiENT'] + self.therapist_turns = ['CONNIRAE', 'ANALYST', 'THERAPIST','COUNSELOR','DR','M','Therapist','Marlatt', 'Lazarus','INTERVIEWER', + 'TH', 'Johnson', 'Scharff', 'T', 'Counselor', 'Wubbolding', 'DR. WARKENTIN', 'Bugental', 'Powers', 'Koocher', + 'Dr. Sklare', 'BECKER', 'Hardy', 'MODERATOR', 'Masek', 'VIRGINIA','MODERATOR', 'Oaklander', 'McCrady', + 'Bugental', 'Krumboltz', 'Miller', 'ANDREAS', 'Kottman', 'Utigaard', 'Wubbolding', 'Carlson', 'JOSH LOMAN', + 'Zweben'] + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. 
subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] +# _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + mean_length = 0 + import re + if self.text_transforms is not None: + lista = [] + turns = [] + total_turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + p1 = [x for x in p if x!=''] +# import pdb; pdb.set_trace() + for i in p1: + i = i.split(":") +# if any(c in i[0] for c in self.therapist_turns): + if len(i) != 1 and not '' in i: + s = self.text_transforms(i[1]) + if len(s) >= 5: + if (i[0] in self.patient_turns): + turns.append(i[0]) + total_turns.append(0) + lista.append(s) + mean_length = mean_length + len(i[1]) + + elif (i[0] not in self.therapist_turns): + match = re.match(r"([a-z]+)([0-9]+)", i[0], re.I) or re.match(r"([a-z]+)( )([0-9]+)", i[0], re.I) + if match: + items = match.groups() + if ((items[0] in self.patient_turns) or (i[0] in self.therapist_turns)): + turns.append(i[0]) + total_turns.append(0) + lista.append(s) + mean_length = mean_length + len(i[1]) + + elif (i[0] in self.therapist_turns): + total_turns.append(1) + + if len(lista) == 0: + import pdb; pdb.set_trace() + + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_title = self.text_transforms(title) + + lab = int("Depressive disorder" in metadata[7] or "Depressive disorder" in label + or "Depression (emotion)" in label or "Depression (eotion)" in metadata[7]) + + mean_lengths = round(mean_length/len(lista)) + turns_no = len(lista) + grouped_L = [(k, sum(1 for i in g)) for k,g in groupby(total_turns)] + + mx = 1 + for i in grouped_L: + if i[0] == 0 and i[1] > mx: + mx = i[1] + features = [mean_lengths, turns_no, mx] + return (preprocessed_text, preprocessed_title, features, lab) + diff --git a/slp/data/therapy_title_on2.py b/slp/data/therapy_title_on2.py new file mode 100644 index 0000000..23f170f --- /dev/null +++ b/slp/data/therapy_title_on2.py @@ -0,0 +1,170 @@ +import pandas as pd +import os +import csv + +from html.parser import HTMLParser +from sklearn.preprocessing import MultiLabelBinarizer +from torch.utils.data import Dataset + + +def pad_sequence(sequences, batch_first=False, padding_len=None, padding_value=0): + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] +# import pdb; pdb.set_trace() + max_size = sequences[0].size() + + trailing_dims = max_size[1:] + if padding_len is not None: + max_len = padding_len + else: + max_len = max([s.size(0) for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) + for i, 
tensor in enumerate(sequences): + if tensor.size(0) > padding_len: + tensor = tensor[:padding_len] + length = min(tensor.size(0), padding_len) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + return out_tensor + + + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.strict = False + self.convert_charrefs= True + self.fed = [] + def handle_data(self, d): + self.fed.append(d) + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + s = MLStripper() + s.feed(html) + return s.get_data() + + +class TupleDataset(Dataset): + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample, label = self.data[idx] + return sample, label + + +class PsychologicalDataset(Dataset): + def __init__(self, csv_file, root_dir, max_word_len=25, text_transforms=None): + self.file = pd.read_csv(csv_file) + self.root_dir = root_dir + self.max_word_len = max_word_len + self.text_transforms = text_transforms + self.transcript, self.label, self.metadata, self.title = self.get_files_labels_metadata(self.root_dir, self.file) + self.patient_turns = ['CLIENT','PT','PATIENT','CL','Client','Danny','Juan', + 'PARTICIPANT','CG', 'RESPONDENT','F','Angie','Jeff', 'Bill'] + + + def get_files_labels_metadata(self, root_dir, _file): + included_cols = [1,11,12,13,14,15,16,20,22] + #included_cols_names = ['file name', 'session title', 'client gender', + # 'client age', 'client marital status', + # 'client sexual orientation', 'therapist gender', + # 'therapist experience', 'psych. subject', 'therapies'] + + transcripts = [] + labels = [] + title = [] + metadata = [] + + rows = [] + _file = "../data/balanced_new_csv.csv" + with open(_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for row in csv_reader: + rows.append(list(row)) + + for i in range(len(rows)): + labels.append(rows[i][21]) + title.append(rows[i][5]) + metadata.append([rows[i][y] for y in included_cols]) + + for i in range(len(rows)): + f = int(float(rows[i][1])) + filename = str(os.path.join(self.root_dir, str(f))) + '.txt' + fp = open(filename,'r+') + transcript = fp.read() + transcripts.append(transcript) + + return transcripts, labels, metadata, title + + + def __len__(self): + return len(self.label) + + + def __getitem__(self, idx): + preprocessed_text = self.transcript[idx] + label = self.label[idx].split("; ") + title = self.title[idx] + metadata = self.metadata[idx] + + if self.text_transforms is not None: + lista = [] + turns = [] + p = strip_tags(preprocessed_text) + p = p.split("\n") + p1 = [x for x in p if x!=''] + + + for (i, j) in zip(p1[::2], p1[1::2]): + i = i.split(":") + j = j.split(":") + if len(i)!= 1 and len(j)!= 1: + turns.append(i[0]) + turns.append(j[0]) + d = i[1] + ' ' + j[1] + lista.append(self.text_transforms(d)) + + if len(lista) == 0: +# import pdb; pdb.set_trace() + for (i, j) in zip(p1[::2], p1[1::2]): + i = i.split(":") + j = j.split(":") + if len(i)!= 1: + turns.append(i[0]) + isum = i[1] + else: + isum = '' + + if len(j)!= 1: + turns.append(j[0]) + jsum = j[1] + else: + jsum = '' + + d = isum + ' ' + jsum + lista.append(self.text_transforms(d)) +# if len(lista) == 0: +# import pdb; pdb.set_trace() + + +# import pdb; pdb.set_trace() + preprocessed_text = pad_sequence(lista, batch_first=True, padding_len=self.max_word_len) + preprocessed_title = self.text_transforms(title) + + 
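# Note that, unlike therapy_title.py (which also checks the "psych. subject" metadata
# column for "Depressive disorder"), the label below is derived from the session label
# column only.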
lab = int("Depression (emotion)" in label) + return (preprocessed_text, preprocessed_title, lab) + diff --git a/slp/data/transforms.py b/slp/data/transforms.py index 8ae2f22..3f9a7da 100644 --- a/slp/data/transforms.py +++ b/slp/data/transforms.py @@ -1,5 +1,6 @@ import spacy import torch +import re import sentencepiece as spm from transformers import BertTokenizer @@ -9,6 +10,11 @@ from slp.util import mktensor +def remove_punctuation(txt): + ch = "[.?:_'!,)(]" + txt = re.sub(ch, '', txt) + return txt + class SentencepieceTokenizer(object): def __init__( self, diff --git a/slp/load_lexicons/get_BL_features.py b/slp/load_lexicons/get_BL_features.py new file mode 100755 index 0000000..95552b0 --- /dev/null +++ b/slp/load_lexicons/get_BL_features.py @@ -0,0 +1,69 @@ +import os + +BASE_DIR = '../data/' + + +# Opinion Lexicon (or Sentiment Lexicon) - Bing Liu (~6.800 entries) +# -------------------------------------- +# format = dictionary with entries like this: +# word1={'positive': 1, 'negative': 0} +# word2={'positive': 0, 'negative': 1} + +def load_bingliu_lexicon(neg_file, pos_file): + + # returns Bing Liu Opinion lexicon in the form of a dictionary + # keys: words, values: "positive" or "negative" + + _data = {} + + # negative words + lines = open(neg_file, "r", encoding="utf-8").readlines() + lines = lines[35:] + + total_neg_words = len(lines) + + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + _feature = "negative" + _data[_word] = _feature + + + # positive words + lines = open(pos_file, "r", encoding="utf-8").readlines() + lines = lines[35:] + + total_pos_words = len(lines) + cnt = 0 + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + + if _word in _data.keys(): + cnt += 1 + _feature = "positive" + _data[_word] = _feature + + return _data, cnt, total_neg_words, total_pos_words + +#################################################### +# Load Bing Liu Opinion Lexicon +#################################################### + +# get the Bing Liu Opinion Lexicon in the form of a dictionary +# where keys are the unique words +# and values a scalar + +def bing_liu(): + # BL_LEX_PATH = os.path.join(BASE_DIR, 'lexicons_kate', 'Bing_Liu_opinion_lex') + BL_LEX_PATH = BASE_DIR + lexicon, both_pos_neg, neg_words, pos_words = load_bingliu_lexicon(neg_file=os.path.join(BL_LEX_PATH, 'negative-words.txt'), + pos_file=os.path.join(BL_LEX_PATH, 'positive-words.txt')) + lex = {} + for word in lexicon: + if lexicon[word] == 'negative': + lex[word] = [-1.] + elif lexicon[word] == 'positive': + lex[word] = [1.] + return lex + diff --git a/slp/load_lexicons/get_afinn_features.py b/slp/load_lexicons/get_afinn_features.py new file mode 100755 index 0000000..b88ac71 --- /dev/null +++ b/slp/load_lexicons/get_afinn_features.py @@ -0,0 +1,55 @@ +# AFINN is a list of English words rated for valence with an integer +# between minus five (negative) and plus five (positive). The words have +# been manually labeled by Finn Ã…rup Nielsen in 2009-2011. The file +# is tab-separated. Total words: 2477. 
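# A minimal sketch of the expected tab-separated layout (hypothetical sample
# rows, not copied from the actual file):
#
#     abandon<TAB>-2
#     outstanding<TAB>5
#
# load_afinn_lexicon() below keeps the valence as a string, e.g.
# {'abandon': '-2', ...}; it is cast to float later, when the per-word
# feature vectors are assembled in LexiconFeatures.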
+ +import os + +BASE_DIR = '../data/' + + +def load_afinn_lexicon(): + + # returns AFINN lexicon in the form of a dictionary + # keys: words, values: valence score (integer -5 to +5) + +# file = os.path.join(BASE_DIR, 'lexicons_kate', 'AFINN', 'AFINN-111.txt') + file = os.path.join(BASE_DIR, 'AFINN-111.txt') + + _data = {} + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split('\t') + _word = _row[0] + _feature = _row[1] + _data[_word] = _feature + return _data + +def load_features(file): + + print("edw") + dim2num = {} # [dimension name]: corresponding number in lexicon list + num2dim = {} # the exact opposite + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _dim = _row[1] + dim2num[_dim] = line_id + num2dim[line_id] = _dim + return dim2num, num2dim + +#################################################### +# Load AFINN Lexicon +#################################################### + +# get the AFINN lexicon in the form of a dictionary +# where keys are the unique words +# and values a scalar +# +# total_words = len(lex) + + + + diff --git a/slp/load_lexicons/get_all_6lexicons.py b/slp/load_lexicons/get_all_6lexicons.py new file mode 100644 index 0000000..8681c78 --- /dev/null +++ b/slp/load_lexicons/get_all_6lexicons.py @@ -0,0 +1,75 @@ +import os +import torch + +import torch.nn as nn + +from get_afinn_features import load_afinn_lexicon +from get_BL_features import * +from get_liwc_features import load_liwc_lex , load_features +from get_mpqa_features import * +from get_semeval2015_twitter_features import * +from get_nrc_emolex_features import * +from slp.data.therapy_title import pad_sequence +from slp.data.transforms import ToTensor + +BASE_DIR = '../data/' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +#DEVICE = 'cpu' + +class LexiconFeatures(nn.Module): + def __init__(self): + super(LexiconFeatures, self).__init__() + + self.afinn = load_afinn_lexicon() + self.BL = bing_liu() + self.liwc = load_liwc_lex() + self.liwc_classes, _ = load_features(os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + self.mpqa = mpqa_lex() + self.semeval = semeval15_lexicon() + self.emolex = emolex() + + def forward(self, inputs, idx2word, padding_len): + to_tensor = ToTensor(device=DEVICE) + final_vector = [] +# import pdb; pdb.set_trace() + for inputt in inputs: + vec = [] + for inp in inputt: + word = idx2word[inp.item()] + vector = [] + if word in self.afinn: + vector.append(float(self.afinn[word])) + else: + vector.append(float(0)) + if word in self.semeval: + vector.append(self.semeval[word]) + else: + vector.append(float(0)) + if word in self.BL: + try: + vector.append(float(self.BL[word][0])) + except: + import pdb; pdb.set_trace() + else: + vector.append(float(0)) + if word in self.mpqa: + vector.extend(self.mpqa[word]) + else: + vector.extend([float(0)]*4) + if word in self.liwc: + v = [float(i) for i in self.liwc[word]] + vector.extend(v) + else: + vector.extend([float(0)]*73) + if word in self.emolex: + v = [float(i) for i in self.emolex[word]] + vector.extend(v) + else: + vector.extend([float(0)]*19) + vec.append(vector) + try: + final_vector.append(to_tensor(vec)) + except: + import pdb; pdb.set_trace() + final_vector = pad_sequence(final_vector, padding_len=padding_len, batch_first=True) + return final_vector diff --git a/slp/load_lexicons/get_all_lexicons.py b/slp/load_lexicons/get_all_lexicons.py new file mode 100644 
index 0000000..dad58c0 --- /dev/null +++ b/slp/load_lexicons/get_all_lexicons.py @@ -0,0 +1,75 @@ +import os +import torch + +import torch.nn as nn + +from get_afinn_features import load_afinn_lexicon +from get_BL_features import * +from get_liwc_features import load_liwc_lex , load_features +from get_mpqa_features import * +from get_semeval2015_twitter_features import * +from get_nrc_emolex_features import * +from slp.data.therapy_title import pad_sequence +from slp.data.transforms import ToTensor + +BASE_DIR = '../data/' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +#def LexiconFeatures(inputs): + +class LexiconFeatures(nn.Module): + def __init__(self): + super(LexiconFeatures, self).__init__() + + self.afinn = load_afinn_lexicon() + self.BL = bing_liu() + self.liwc = load_liwc_lex() + self.liwc_classes, _ = load_features(os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + self.mpqa = mpqa_lex() + self.semeval = semeval15_lexicon() + self.emolex = emolex() + + def forward(self, inputs, idx2word, padding_len): + to_tensor = ToTensor(device=DEVICE) + final_vector = [] +# import pdb; pdb.set_trace() + for inputt in inputs: + vec = [] + for inp in inputt: + word = idx2word[inp] + vector = [] + inp = inp.item() + if word in self.afinn: + vector.append(float(self.afinn[word])) + else: + vector.append(float(0)) + if word in self.semeval: + vector.append(self.semeval[word]) + else: + vector.append(float(0)) + if word in self.BL: +# import pdb; pdb.set_trace() + vector.append(float(self.BL[word[0])) + else: + vector.append(float(0)) + if word in self.mpqa: + vector.extend(self.mpqa[word]) + else: + vector.extend([float(0)]*4) + if word in self.liwc: + v = [float(i) for i in self.liwc[word]] + vector.extend(v) + else: + vector.extend([float(0)]*73) + if word in self.emolex: + v = [float(i) for i in self.emolex[word]] + vector.extend(v) + else: + vector.extend([float(0)]*19) + vec.append(vector) + try: + final_vector.append(to_tensor(vec)) + except: + import pdb; pdb.set_trace() + final_vector = pad_sequence(final_vector, padding_len=padding_len, batch_first=True) + return final_vector diff --git a/slp/load_lexicons/get_liwc_features.py b/slp/load_lexicons/get_liwc_features.py new file mode 100755 index 0000000..04d8343 --- /dev/null +++ b/slp/load_lexicons/get_liwc_features.py @@ -0,0 +1,99 @@ +import os + +#from sys_config import BASE_DIR +import matplotlib.pyplot as plt +import seaborn as sns + +BASE_DIR = '../data/' +# LIWC Lexicon http://lit.eecs.umich.edu/~geoliwc/LIWC_Dictionary.htm + +def load_liwc_lexicon(file): + # returns LIWC in the form of a dictionary + # keys: words, values: feature vector (list) + + + _data = {} + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _word = _row[0] + _features = _row[1:] + _data[_word] = _features + return _data + + +def load_features(file): + + dim2num = {} # [dimension name]: corresponding number in lexicon list + num2dim = {} # the exact opposite + + lines = open(file, "r", encoding="utf-8").readlines() + for line_id, line in enumerate(lines): + _row = line.rstrip().split(" ") + _dim = _row[1] + dim2num[_dim] = line_id + num2dim[line_id] = _dim + return dim2num, num2dim + + +#################################################### +# Load LIWC Lexicon +#################################################### + +def liwc_lex(): + # get the liwc lexicon in the form of a dictionary + # where keys are the unique words + # and values a list with all the 
dimensions (73 in total) + + + lex = load_liwc_lexicon( + os.path.join(BASE_DIR, 'PsycholinguisticLexicon.txt')) + + total_words = len(lex) + + # get the two dictionaries that relate every dimension name + # with its corresponding number (value) in the lexicon dimension list + dim2num, num2dim = load_features( + os.path.join(BASE_DIR, 'PsycholinguisticDimensions.txt')) + + #################################################### + # Plot statistics of LIWC Lexicon + #################################################### + + # The lexiconss has 18504 words and for each word a feature vector of size 71. + # Each dimension represents a category (for example affect, posemo, negemo etc) + # The vector contains '1' when this word is includied in the particular category. + # Otherwise '0'. + # Using a bar plot we can decide which dimensions of this feature vector are useful for our work. + + # initialization of count dictionary + dimensions = list(dim2num.keys()) + dim_counts = {dim: 0 for dim in dimensions} + + for word in lex: + ones = [i for i, x in enumerate(lex[word]) if x == '1'] + for index in ones: + dim_counts[num2dim[index]] += 1 + + sorted_tuples = sorted(dim_counts.items(), key=lambda kv: kv[1]) + + x = [k[1] for k in sorted_tuples if k[1] > 500] + y = [k[0] for k in sorted_tuples if k[1] > 500] + + + plt.figure() + sns.barplot(x=x, y=y) + plt.title('Number of words for each dimension of the LIWC lexicon') + # plt.show() + plt.savefig('liwc_dims_statistics.png') + # plt.close() + + print(len(lex)) + + +def load_liwc_lex(): + return load_liwc_lexicon( + os.path.join(BASE_DIR, 'PsycholinguisticLexicon.txt')) + + # liwc_lex() diff --git a/slp/load_lexicons/get_mpqa_features.py b/slp/load_lexicons/get_mpqa_features.py new file mode 100755 index 0000000..e8b79d9 --- /dev/null +++ b/slp/load_lexicons/get_mpqa_features.py @@ -0,0 +1,63 @@ +import os +import pickle + +BASE_DIR = '../data/' + +def mpqa_lex(): + path = os.path.join(BASE_DIR, 'mpqa.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + + pos = list(data["reinforcement"].keys())[0] + + # dictionary in the following form: + # {'word': + # {'POS': + # {'strength':weaksubj or strongsubj, + # 'positive': 0 or 1, + # 'negative': 0 or 1, + # 'polarity': 0 or 1}}} + + polarities = [] + strengths = [] + pos_tags = [] + negatives = [] + positives = [] + lexicon = {} + feat_lexicon = {} + for key in data: + pos = list(data[key].keys())[0] + lexicon[key] = {'pos': pos, + 'strength': data[key][pos]['strength'], + 'positive': data[key][pos]['positive'], + 'negative': data[key][pos]['negative'], + 'polarity': data[key][pos]['polarity']} + polarities.append(data[key][pos]['polarity']) + pos_tags.append(pos) + negatives.append(data[key][pos]['negative']) + positives.append(data[key][pos]['positive']) + strengths.append(data[key][pos]['strength']) + + # first we add to the feature vector the subjectivity + if data[key][pos]['strength'] == "strongsubj": + feat_lexicon[key] = [1.0] + elif data[key][pos]['strength'] == "weaksubj": + feat_lexicon[key] = [0.0] + # then, the polarity + feat_lexicon[key].append(float(data[key][pos]['polarity'])) + # then, the positivity + feat_lexicon[key].append(float(data[key][pos]['positive'])) + # and finally the negativity + feat_lexicon[key].append(float(data[key][pos]['positive'])) + # print(len(lexicon)) + + # now it is in the form: { word:{'pos':_, 'positive':_, 'negative':_, 'polarity':_} } + # polarity: -2 to +2 + # pos: 'NOUN', 'ADJ', 'ADV', 'VERB', '_' + # strength: weaksubj or strongsubj + # 
positive/negative: 0 or 1 + + # the lists are for statistics. total words: 6886 + + return feat_lexicon + diff --git a/slp/load_lexicons/get_nrc_emolex_features.py b/slp/load_lexicons/get_nrc_emolex_features.py new file mode 100755 index 0000000..8dce1ef --- /dev/null +++ b/slp/load_lexicons/get_nrc_emolex_features.py @@ -0,0 +1,27 @@ +import os +import pickle + +# NRC Emotion Lexicon (Emolex) +# Total words: 14,182 +# dictionary: {word: {'fear':_, 'joy':_, 'positive':_, 'emotions':(list of len 8), 'sadness':_, +# 'negative':_, 'anticipation':_, 'polarity':_, 'anger':_, 'disgust':_, 'trust':_, 'surprise':_}} + +BASE_DIR = '../data/' + +def emolex(): + path = os.path.join(BASE_DIR, 'emolex.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + + lex = {} + for word in data: + features = [] + for key in data[word]: + if not isinstance(data[word][key], list): + features.append(data[word][key]) + else: + features += data[word][key] + lex[word]=features + + + return lex diff --git a/slp/load_lexicons/get_semeval2015_twitter_features.py b/slp/load_lexicons/get_semeval2015_twitter_features.py new file mode 100755 index 0000000..04a2d0c --- /dev/null +++ b/slp/load_lexicons/get_semeval2015_twitter_features.py @@ -0,0 +1,18 @@ +import os +import pickle + +BASE_DIR = '../data/' + +# SemEval-2015 English Twitter Sentiment Lexicon +# aka NRC MaxDiff Twitter Sentiment Lexicon +# Total words: 1515 (including hashtags like #ew) +# dictionary: {word: real value -1 to +1, representing negative/positive sentiment} + +def semeval15_lexicon(): + path = os.path.join(BASE_DIR, 'SemEval2015-English-Twitter-Lexicon.pickle') + with open(path, 'rb') as f: + data = pickle.load(f) + return data + + +# print(len(data)) diff --git a/slp/modules/basic_model.py b/slp/modules/basic_model.py new file mode 100644 index 0000000..0b01a86 --- /dev/null +++ b/slp/modules/basic_model.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence + + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + def forward(self, inputs, lengths, hidden_state, is_title=False): + output_emb = self.lookup(inputs) + +# import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, hidden_size=300, num_classes=0): + super(SentAttNet, 
self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, features, lengths, titles, hidden_state): +# import pdb; pdb.set_trace() + + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + titles = torch.unsqueeze(titles, dim=1) + f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + + #features = torch.unsqueeze(features, dim=2) + + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + +# output = torch.cat((output,features), dim=1) + output = self.fc(output).squeeze() + + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, features, titles, title_lengths): + # inputs = (B, S, W) + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + output_list_text = [] + +# import pdb; pdb.set_trace() + + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 +# all_word_lengths.append(word_lengths) + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, + self.word_hidden_state, + is_title=False) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + + + output_title, self.word_hidden_state = self.word_att_net_text(titles, ### title + title_lengths, + self.word_hidden_state, + is_title=True) #[8,600] + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, features, lengths, output_title, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/basic_model_DRNN.py b/slp/modules/basic_model_DRNN.py new file mode 100644 index 0000000..0bff7f0 --- /dev/null +++ b/slp/modules/basic_model_DRNN.py @@ -0,0 +1,199 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import 
PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.modules.drnn import DRNN + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) +2 + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + n_layers = 2 + embedding_size = 300 + self.model = DRNN(embedding_size, hidden_size, n_layers, cell_type='GRU') + + + + + #self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + def forward(self, inputs, hidden_state, is_title=False): + output_emb = self.lookup(inputs) + + import pdb; pdb.set_trace() + +# output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.model(output_emb.float(), hidden_state) +# f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + #titles = torch.unsqueeze(titles, dim=1) + #f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + + n_layers = 2 + n_classes = 2 + embedding_size = 300 + + self.model = DRNN(embedding_size, hidden_size, n_layers, cell_type='GRU', batch_first=True) + + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + + self.linear = nn.Linear(hidden_size, n_classes) + + self.sent = nn.Linear(hidden_size, hidden_size) + self.context = nn.Linear(hidden_size, 1, bias=False) + + + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = 
last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + output_list_text = [] + + import pdb; pdb.set_trace() + batch_size = inputs.size(0) + num_sentences = inputs.size(1) + padded = inputs.size(2) + #inputs = inputs.view(batch_size * num_sentences, -1) + + inputs = inputs.view(batch_size, num_sentences * padded) + + output_emb = self.lookup(inputs) + layer_outputs, self.word_hidden_state = self.model(output_emb.float()) + #self.word_hidden_state) + output_list_text.append(layer_outputs) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + + + + + + preds = [] + for i in range(batch_size): + output = self.sent(layer_outputs[i]) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + pred = self.linear(output) + preds.append(pred) +# pred = self.linear(layer_outputs[-1]) + output = preds + + + +#----- +# for i in text: + +# word_lengths = i.size(1) - (i==0).sum(dim=1) +# if 0 in word_lengths: +# for k in range(0, inputs.size()[0]): +# if word_lengths[k] == 0: +# word_lengths[k] = 1 +### all_word_lengths.append(word_lengths) +# output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, +# self.word_hidden_state, +# is_title=False) #[8,600] +# output_list_text.append(output_text) +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) +#----- + + + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) + import pdb; pdb.set_trace() + + +# output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) +# output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) +# self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/drnn.py b/slp/modules/drnn.py new file mode 100644 index 0000000..886cb33 --- /dev/null +++ b/slp/modules/drnn.py @@ -0,0 +1,131 @@ +import torch +import torch.nn as nn + + +use_cuda = torch.cuda.is_available() + + +class DRNN(nn.Module): + + def __init__(self, n_input, n_hidden, n_layers, dropout=0, cell_type='GRU', batch_first=False): + super(DRNN, self).__init__() + + self.dilations = [2 ** i for i in range(n_layers)] + self.cell_type = cell_type + self.batch_first = batch_first + + layers = [] + if self.cell_type == "GRU": + cell = nn.GRU + elif self.cell_type == "RNN": + cell = nn.RNN + elif self.cell_type == "LSTM": + cell = nn.LSTM + else: + raise NotImplementedError + + for i in range(n_layers): + if i == 0: + c = cell(n_input, n_hidden, dropout=dropout) + else: + c = cell(n_hidden, n_hidden, dropout=dropout) + layers.append(c) + self.cells = nn.Sequential(*layers) + + def forward(self, inputs, hidden=None): + +# import pdb; pdb.set_trace() + + if self.batch_first: + inputs = inputs.transpose(0, 1) + outputs = [] + for i, (cell, dilation) in enumerate(zip(self.cells, self.dilations)): + if hidden is None: + inputs, _ = self.drnn_layer(cell, inputs, dilation) + else: + inputs, hidden[i] = self.drnn_layer(cell, inputs, dilation, hidden[i]) + + outputs.append(inputs[-dilation:]) + + if self.batch_first: + inputs = inputs.transpose(0, 1) + 
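        # Return values, as built above:
        #   inputs  - full output sequence of the deepest layer, transposed back
        #             to (batch, time, hidden) when batch_first is set;
        #   outputs - list with one entry per layer containing the last
        #             `dilation` time steps of that layer's output (time-major),
        #             which callers can treat as a per-layer hidden-state summary.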
return inputs, outputs + + def drnn_layer(self, cell, inputs, rate, hidden=None): + n_steps = len(inputs) + batch_size = inputs[0].size(0) + hidden_size = cell.hidden_size + + inputs, _ = self._pad_inputs(inputs, n_steps, rate) + dilated_inputs = self._prepare_inputs(inputs, rate) + + if hidden is None: + dilated_outputs, hidden = self._apply_cell(dilated_inputs, cell, batch_size, rate, hidden_size) + else: + hidden = self._prepare_inputs(hidden, rate) + dilated_outputs, hidden = self._apply_cell(dilated_inputs, cell, batch_size, rate, hidden_size, hidden=hidden) + + splitted_outputs = self._split_outputs(dilated_outputs, rate) + outputs = self._unpad_outputs(splitted_outputs, n_steps) + + return outputs, hidden + + def _apply_cell(self, dilated_inputs, cell, batch_size, rate, hidden_size, hidden=None): + if hidden is None: + if self.cell_type == 'LSTM': + c, m = self.init_hidden(batch_size * rate, hidden_size) + hidden = (c.unsqueeze(0), m.unsqueeze(0)) + else: + hidden = self.init_hidden(batch_size * rate, hidden_size).unsqueeze(0) + + dilated_outputs, hidden = cell(dilated_inputs, hidden) + + return dilated_outputs, hidden + + def _unpad_outputs(self, splitted_outputs, n_steps): + return splitted_outputs[:n_steps] + + def _split_outputs(self, dilated_outputs, rate): + batchsize = dilated_outputs.size(1) // rate + + blocks = [dilated_outputs[:, i * batchsize: (i + 1) * batchsize, :] for i in range(rate)] + + interleaved = torch.stack((blocks)).transpose(1, 0).contiguous() + interleaved = interleaved.view(dilated_outputs.size(0) * rate, + batchsize, + dilated_outputs.size(2)) + return interleaved + + def _pad_inputs(self, inputs, n_steps, rate): + is_even = (n_steps % rate) == 0 + + if not is_even: + dilated_steps = n_steps // rate + 1 + + zeros_ = torch.zeros(dilated_steps * rate - inputs.size(0), + inputs.size(1), + inputs.size(2)) + if use_cuda: + zeros_ = zeros_.cuda() + + inputs = torch.cat((inputs, zeros_)) + else: + dilated_steps = n_steps // rate + + return inputs, dilated_steps + + def _prepare_inputs(self, inputs, rate): + dilated_inputs = torch.cat([inputs[j::rate, :, :] for j in range(rate)], 1) + return dilated_inputs + + def init_hidden(self, batch_size, hidden_dim): + hidden = torch.zeros(batch_size, hidden_dim) + if use_cuda: + hidden = hidden.cuda() + if self.cell_type == "LSTM": + memory = torch.zeros(batch_size, hidden_dim) + if use_cuda: + memory = memory.cuda() + return (hidden, memory) + else: + return hidden diff --git a/slp/modules/helpers.py b/slp/modules/helpers.py index 334c1c3..908fa83 100644 --- a/slp/modules/helpers.py +++ b/slp/modules/helpers.py @@ -10,6 +10,7 @@ def __init__(self, batch_first=True): self.batch_first = batch_first def forward(self, x, lengths): +# import pdb; pdb.set_trace() max_length = lengths.max().item() x, _ = pad_packed_sequence( x, batch_first=self.batch_first, total_length=max_length) @@ -22,6 +23,7 @@ def __init__(self, batch_first=True): self.batch_first = batch_first def forward(self, x, lengths): +# import pdb; pdb.set_trace() x = pack_padded_sequence( x, lengths, batch_first=self.batch_first, diff --git a/slp/modules/hier_att_net.py b/slp/modules/hier_att_net.py new file mode 100644 index 0000000..9020e6d --- /dev/null +++ b/slp/modules/hier_att_net.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + 
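# Both WordAttNet and SentAttNet below reduce a (B, T, 2H) GRU output to a
# single (B, 2H) vector with the same attention pooling. A minimal standalone
# sketch of that pooling (shapes and names here are illustrative only, not the
# exact module weights):
#
#     import torch
#     import torch.nn.functional as F
#
#     B, T, H2 = 4, 10, 600                        # batch, time steps, 2*hidden
#     f_output = torch.randn(B, T, H2)             # BiGRU outputs
#     proj = torch.nn.Linear(H2, H2)               # plays the role of self.word / self.sent
#     ctx = torch.nn.Linear(H2, 1, bias=False)     # plays the role of self.context
#
#     scores = F.softmax(ctx(proj(f_output)), dim=1)   # (B, T, 1) weights over time
#     pooled = (f_output * scores).sum(1)              # (B, 2H) attended summary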
+def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): + output = self.lookup(inputs) + + output, lengths = self.pack(output,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self,hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 300, bidirectional=True, batch_first=True) + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths): + # inputs = (B, S, W) + output_list_text = [] + text = inputs.permute(1, 0, 2) + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 + + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) 
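            # output_list_text is a Python list of S tensors, each of shape
            # (B, 2*hidden). Assuming slp.data.therapy.pad_sequence behaves like
            # the variant defined in therapy_title_on2.py earlier in this patch,
            # the call below stacks the list into one (batch, S, 2*hidden) tensor;
            # padding_len=self.batch_size zero-pads the batch dimension when the
            # final batch is smaller than the configured batch size.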
+ + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/hier_att_net_title.py b/slp/modules/hier_att_net_title.py new file mode 100644 index 0000000..4b1c3cc --- /dev/null +++ b/slp/modules/hier_att_net_title.py @@ -0,0 +1,138 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_lexicons import LexiconFeatures + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(380, 380, bidirectional = True, batch_first=True) #changed hidden & input size. + + self.word = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() + + def forward(self, inputs, lengths, hidden_state, idx2word): + output = self.lookup(inputs) + + output_lex = self.lexicons(inputs, idx2word, padding_len=output.shape[1]).float() + + output = torch.cat((output, output_lex), axis=2) #to concatenation tis eisodou me ta lexica + + output, lengths = self.pack(output,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self,hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size, 380, bidirectional=True, batch_first=True) #changed hidden size + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, titles, hidden_state): + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + + titles = torch.unsqueeze(titles, dim=1) + f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size 
= hidden_size + self.idx2word = idx2word + + self.sent_att_net = SentAttNet(self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + output_list_text = [] + text = inputs.permute(1, 0, 2) + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + word_lengths[k] = 1 + + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, self.word_hidden_state, self.idx2word) #[8,600] + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, output_title, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/hier_att_net_title_attentional_embed.py b/slp/modules/hier_att_net_title_attentional_embed.py new file mode 100644 index 0000000..199b2aa --- /dev/null +++ b/slp/modules/hier_att_net_title_attentional_embed.py @@ -0,0 +1,171 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_6lexicons import LexiconFeatures + +DEVICE = 'cpu' +#DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, lex_size, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) #changed hidden & input size. 
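        # The attention projections below operate on the BiGRU output
        # concatenated with per-word lexicon features (see forward), hence the
        # 2*hidden_size + lex_size width. With the six lexicons wired up in
        # get_all_6lexicons.LexiconFeatures, the per-word vector is
        # 1 (AFINN) + 1 (SemEval-2015) + 1 (Bing Liu) + 4 (MPQA) + 73 (LIWC)
        # + 19 (EmoLex) = 99 dims, so lex_size is expected to be 99 here.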
+ + self.word = nn.Linear(2 * hidden_size + lex_size, 2 * hidden_size + lex_size) + self.context = nn.Linear(2 * hidden_size + lex_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() + + def forward(self, inputs, lengths, hidden_state, idx2word, lex_size, is_title=False): + output_emb = self.lookup(inputs) + + import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + try: + f_output, h_output = self.gru(output.float(), hidden_state) + except: + for i in output[0]: + if i.shape[0]!=300: + import pdb; pdb.set_trace() + f_output = self.unpack(f_output, lengths, max_length = 150) + +# f_output = f_output.view(batch_size, num_sentences, -1) + + if is_title == False: + output_lex = self.lexicons(inputs, idx2word, padding_len=f_output.shape[1]).float() + f_output = torch.cat((f_output, output_lex), axis=2) + else: + import pdb; pdb.set_trace() + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, lex_size, hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size + lex_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + +# titles = torch.unsqueeze(titles, dim=1) #try without title +# f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word, lex_size): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + self.idx2word = idx2word + self.lex_size = lex_size + + self.sent_att_net = SentAttNet(lex_size, self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, lex_size, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, number_of_sentences, length_of_sentences, titles, title_lengths): + # inputs = (B, S, W) + import pdb; pdb.set_trace() + text = 
inputs.permute(1, 0, 2) + all_word_lengths = [] + + batch_size = inputs.size(0) + num_sentences = inputs.size(1) + inputs = inputs.view(batch_size * num_sentences, -1) + length_of_sentences = length_of_sentences.view(-1) + + temp = inputs[:100] + temp_lengths = length_of_sentences[:100] + + output_text, self.word_hidden_state = self.word_att_net_text(temp, temp_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) + + +# ----- +# for i in text: + +# word_lengths = i.size(1) - (i==0).sum(dim=1) # to mikos tis kathe protasis apo tis 8 tou batch kathe fora +# if 0 in word_lengths: +# for k in range(0, inputs.size()[0]): +# if word_lengths[k] == 0: +# word_lengths[k] = 1 +# all_word_lengths.append(word_lengths) +# output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) #[8,600] +# output_list_text.append(output_text) +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + +# ----- + + +# output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, # try without title +# self.word_hidden_state, self.idx2word, +# self.lex_size, is_title=True) #[8,600] +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, number_of_sentences, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/modules/hier_att_net_title_attentional_embed2.py b/slp/modules/hier_att_net_title_attentional_embed2.py new file mode 100644 index 0000000..a9cb4d6 --- /dev/null +++ b/slp/modules/hier_att_net_title_attentional_embed2.py @@ -0,0 +1,149 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from slp.modules.helpers import PackSequence, PadPackedSequence +from slp.data.therapy import pad_sequence +from slp.load_lexicons.get_all_6lexicons import LexiconFeatures + +#DEVICE = 'cpu' +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' + + +def repackage_hidden(h): + """Wraps hidden states in new Tensors, to detach them from their history.""" + + if isinstance(h, torch.Tensor): + return h.detach() + else: + return tuple(repackage_hidden(v) for v in h) + + +class WordAttNet(nn.Module): + def __init__(self, dict_size, diction, lex_size, hidden_size=300): + super(WordAttNet, self).__init__() + + self.gru = nn.GRU(300, 300, bidirectional = True, batch_first=True) #changed hidden & input size. 
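        # Same layout as hier_att_net_title_attentional_embed.py: the 300-d
        # embeddings go through the BiGRU first and the per-word lexicon vector
        # is concatenated to the GRU output just before attention (see forward),
        # unlike hier_att_net_title.py where lexicon features are appended to
        # the embeddings before the GRU. Hence the 2*hidden_size + lex_size
        # projections below.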
+ + self.word = nn.Linear(2 * hidden_size + lex_size, 2 * hidden_size + lex_size) + self.context = nn.Linear(2 * hidden_size + lex_size, 1, bias=False) + self.diction = diction + self.dict_size = dict_size + self.lookup = nn.Embedding(num_embeddings = self.dict_size, embedding_dim =300).from_pretrained(self.diction) + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + self.lexicons = LexiconFeatures() + + def forward(self, inputs, lengths, hidden_state, idx2word, lex_size, is_title=False): + output_emb = self.lookup(inputs) + +# import pdb; pdb.set_trace() + + output, lengths = self.pack(output_emb ,lengths) + f_output, h_output = self.gru(output.float(), hidden_state) + f_output = self.unpack(f_output, lengths) + + if is_title == False: + output_lex = self.lexicons(inputs, idx2word, padding_len=f_output.shape[1]).float() + f_output = torch.cat((f_output, output_lex), axis=2) + else: + import pdb; pdb.set_trace() + + output = self.word(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + return output, h_output + + +class SentAttNet(nn.Module): + def __init__(self, lex_size, hidden_size=300, num_classes=0): + super(SentAttNet, self).__init__() + num_classes = num_classes + self.gru = nn.GRU(2 * hidden_size + lex_size, 300, bidirectional=True, batch_first=True) #changed hidden size + + self.sent = nn.Linear(2 * hidden_size, 2 * hidden_size) + self.context = nn.Linear(2 * hidden_size, 1, bias=False) + self.fc = nn.Linear(2 * hidden_size, num_classes) + + self.pack = PackSequence(batch_first=True) + self.unpack = PadPackedSequence(batch_first=True) + + + def forward(self, inputs, lengths, hidden_state): +# import pdb; pdb.set_trace() + f_output, lengths = self.pack(inputs, lengths) + f_output, h_output = self.gru(f_output, hidden_state) + f_output = self.unpack(f_output, lengths) + +# titles = torch.unsqueeze(titles, dim=1) #try without title +# f_output = torch.cat((f_output,titles), dim=1) + + + output = self.sent(f_output) + output = self.context(output) + output = F.softmax(output, dim=1) + output = (f_output * output).sum(1) + output = self.fc(output).squeeze() + return output, h_output + + +class HierAttNet(nn.Module): + def __init__(self, hidden_size, batch_size, num_classes, max_sent_len, dict_size, diction, idx2word, lex_size): + super (HierAttNet, self).__init__() + + self.num_classes = num_classes + self.batch_size = batch_size + self.hidden_size = hidden_size + self.idx2word = idx2word + self.lex_size = lex_size + + self.sent_att_net = SentAttNet(lex_size, self.hidden_size, num_classes) + self.word_att_net_text = WordAttNet(dict_size, diction, lex_size, hidden_size) + + self.max_sent_len = max_sent_len + + self._init_hidden_state() + + def _init_hidden_state(self, last_batch_size=None): + if last_batch_size: + batch_size = last_batch_size + else: + batch_size = self.batch_size + self.word_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.sent_hidden_state = torch.zeros(2, batch_size, self.hidden_size) + self.word_hidden_state = self.word_hidden_state.to(DEVICE) + self.sent_hidden_state = self.sent_hidden_state.to(DEVICE) + + def forward(self, inputs, lengths, titles, title_lengths): + # inputs = (B, S, W) + import pdb; pdb.set_trace() + text = inputs.permute(1, 0, 2) + all_word_lengths = [] + for i in text: + + word_lengths = i.size(1) - (i==0).sum(dim=1) + if 0 in word_lengths: + for k in range(0, inputs.size()[0]): + if word_lengths[k] == 0: + 
word_lengths[k] = 1 + all_word_lengths.append(word_lengths) + output_text, self.word_hidden_state = self.word_att_net_text(i, word_lengths, self.word_hidden_state, self.idx2word, self.lex_size, is_title=False) #[8,600] + output_list_text.append(output_text) + self.word_hidden_state = repackage_hidden(self.word_hidden_state) + +# output_title, self.word_hidden_state = self.word_att_net_text(titles, title_lengths, # try without title +# self.word_hidden_state, self.idx2word, +# self.lex_size, is_title=True) #[8,600] +# self.word_hidden_state = repackage_hidden(self.word_hidden_state) + + # output_list_text = (S, B, 600) + # output_list_text = (S, B, 760) +# import pdb; pdb.set_trace() + + output_list_text = pad_sequence(output_list_text, padding_len=self.batch_size) + output, self.sent_hidden_state = self.sent_att_net(output_list_text, lengths, self.sent_hidden_state) + self.sent_hidden_state = repackage_hidden(self.sent_hidden_state) + return output + + diff --git a/slp/trainer/trainer.py b/slp/trainer/trainer.py index d3d517b..f4ee265 100644 --- a/slp/trainer/trainer.py +++ b/slp/trainer/trainer.py @@ -154,8 +154,10 @@ def train_step(self: TrainerType, engine: Engine, batch: List[torch.Tensor]) -> float: self.model.train() + # import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) - loss = self.loss_fn(y_pred, targets) # type: ignore + loss = self.loss_fn(y_pred, targets.long()) # type: ignore if self.parallel: loss = loss.mean() loss = loss / self.accumulation_steps @@ -172,6 +174,7 @@ def eval_step( batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: self.model.eval() with torch.no_grad(): + #import pdb; pdb.set_trace() y_pred, targets = self.get_predictions_and_targets(batch) return y_pred, targets @@ -197,6 +200,13 @@ def fit(self: TrainerType, validation=True) self.model.zero_grad() self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + return best_score + + + def overfit_single_batch(self: TrainerType, train_loader: DataLoader) -> State: @@ -267,6 +277,7 @@ def parse_batch( return inputs, inputs + class SequentialTrainer(Trainer): def parse_batch( self, @@ -274,22 +285,87 @@ def parse_batch( inputs = to_device(batch[0], device=self.device, non_blocking=self.non_blocking) - targets = to_device(batch[1], + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + len_inputs = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + len_titles = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + return inputs, titles, targets, len_inputs, len_titles + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, len_inputs, len_titles = self.parse_batch(batch) + y_pred = self.model(inputs, len_inputs) + import pdb; pdb.set_trace() + return y_pred, targets + + +class SequentialTrainerTitle(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], device=self.device, non_blocking=self.non_blocking) - lengths = to_device(batch[2], + len_inputs = to_device(batch[3], 
device=self.device, non_blocking=self.non_blocking) - return inputs, targets, lengths + len_titles = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + return inputs, titles, targets, len_inputs, len_titles def get_predictions_and_targets( self, batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: - inputs, targets, lengths = self.parse_batch(batch) - y_pred = self.model(inputs, lengths) + inputs, titles, targets, len_inputs, len_titles = self.parse_batch(batch) + y_pred = self.model(inputs, len_inputs) + import pdb; pdb.set_trace() return y_pred, targets +class BertTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + masks = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + segments = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, masks, segments + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets, masks, segments = self.parse_batch(batch) + #import pdb; pdb.set_trace() + logits = self.model(inputs, token_type_ids=segments, attention_mask=masks) + + return logits, targets + + class Seq2seqTrainer(SequentialTrainer): def parse_batch( self, diff --git a/slp/trainer/trainer_title.py b/slp/trainer/trainer_title.py new file mode 100644 index 0000000..f944752 --- /dev/null +++ b/slp/trainer/trainer_title.py @@ -0,0 +1,351 @@ +import os +from typing import Union +import torch +import torch.nn as nn + +from ignite.handlers import EarlyStopping +from ignite.contrib.handlers import ProgressBar +from ignite.engine import Engine, Events, State +from ignite.metrics import RunningAverage, Loss + +from torch.optim.optimizer import Optimizer +from torch.nn.modules.loss import _Loss +from torch.utils.data import DataLoader + +from typing import cast, List, Optional, Tuple, TypeVar +from slp.util import types +from slp.util.parallel import DataParallelModel, DataParallelCriterion + +from slp.trainer.handlers import CheckpointHandler, EvaluationHandler +from slp.util import from_checkpoint, to_device +from slp.util import log +from slp.util import system + + +TrainerType = TypeVar('TrainerType', bound='Trainer') + + +class Trainer(object): + def __init__(self: TrainerType, + model: nn.Module, + optimizer: Optimizer, + checkpoint_dir: str = '../../checkpoints', + experiment_name: str = 'experiment', + model_checkpoint: Optional[str] = None, + optimizer_checkpoint: Optional[str] = None, + metrics: types.GenericDict = None, + patience: int = 10, + validate_every: int = 1, + accumulation_steps: int = 1, + loss_fn: Union[_Loss, DataParallelCriterion] = None, + non_blocking: bool = True, + retain_graph: bool = False, + dtype: torch.dtype = torch.float, + device: str = 'cpu', + parallel: bool = False) -> None: + self.dtype = dtype + self.retain_graph = retain_graph + self.non_blocking = non_blocking + self.device = device + self.loss_fn = loss_fn + self.validate_every = validate_every + self.patience = patience + self.accumulation_steps = accumulation_steps + self.checkpoint_dir = checkpoint_dir + + model_checkpoint = self._check_checkpoint(model_checkpoint) + optimizer_checkpoint = self._check_checkpoint(optimizer_checkpoint) + + self.model = cast(nn.Module, from_checkpoint( + model_checkpoint, model, 
map_location=torch.device('cpu'))) + self.model = self.model.type(dtype).to(device) + self.optimizer = from_checkpoint(optimizer_checkpoint, optimizer) + self.parallel = parallel + if parallel: + if device == 'cpu': + raise ValueError("parallel can be used only with cuda device") + self.model = DataParallelModel(self.model).to(device) + self.loss_fn = DataParallelCriterion(self.loss_fn) # type: ignore + if metrics is None: + metrics = {} + if 'loss' not in metrics: + if self.parallel: + metrics['loss'] = Loss( + lambda x, y: self.loss_fn(x, y).mean()) # type: ignore + else: + metrics['loss'] = Loss(self.loss_fn) + self.trainer = Engine(self.train_step) + self.train_evaluator = Engine(self.eval_step) + self.valid_evaluator = Engine(self.eval_step) + for name, metric in metrics.items(): + metric.attach(self.train_evaluator, name) + metric.attach(self.valid_evaluator, name) + + self.pbar = ProgressBar() + self.val_pbar = ProgressBar(desc='Validation') + + if checkpoint_dir is not None: + self.checkpoint = CheckpointHandler( + checkpoint_dir, experiment_name, score_name='validation_loss', + score_function=self._score_fn, n_saved=2, + require_empty=False, save_as_state_dict=True) + + self.early_stop = EarlyStopping( + patience, self._score_fn, self.trainer) + + self.val_handler = EvaluationHandler(pbar=self.pbar, + validate_every=1, + early_stopping=self.early_stop) + self.attach() + log.info( + f'Trainer configured to run {experiment_name}\n' + f'\tpretrained model: {model_checkpoint} {optimizer_checkpoint}\n' + f'\tcheckpoint directory: {checkpoint_dir}\n' + f'\tpatience: {patience}\n' + f'\taccumulation steps: {accumulation_steps}\n' + f'\tnon blocking: {non_blocking}\n' + f'\tretain graph: {retain_graph}\n' + f'\tdevice: {device}\n' + f'\tmodel dtype: {dtype}\n' + f'\tparallel: {parallel}') + + def _check_checkpoint(self: TrainerType, + ckpt: Optional[str]) -> Optional[str]: + if ckpt is None: + return ckpt + if system.is_url(ckpt): + ckpt = system.download_url(cast(str, ckpt), self.checkpoint_dir) + ckpt = os.path.join(self.checkpoint_dir, ckpt) + return ckpt + + @staticmethod + def _score_fn(engine: Engine) -> float: + """Returns the scoring metric for checkpointing and early stopping + + Args: + engine (ignite.engine.Engine): The engine that calculates + the val loss + + Returns: + (float): The validation loss + """ + negloss: float = -engine.state.metrics['loss'] + return negloss + + def parse_batch( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets + + def get_predictions_and_targets( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets = self.parse_batch(batch) + y_pred = self.model(inputs) + return y_pred, targets + + def train_step(self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> float: + self.model.train() + #import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) + loss = self.loss_fn(y_pred, targets.long()) # type: ignore + if self.parallel: + loss = loss.mean() + loss = loss / self.accumulation_steps + loss.backward(retain_graph=self.retain_graph) + if (self.trainer.state.iteration + 1) % self.accumulation_steps == 0: + self.optimizer.step() # type: ignore + self.optimizer.zero_grad() + loss_value: float = loss.item() + return loss_value + + def eval_step( + 
self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + self.model.eval() + with torch.no_grad(): + y_pred, targets = self.get_predictions_and_targets(batch) + return y_pred, targets + + def predict(self: TrainerType, dataloader: DataLoader) -> State: + return self.valid_evaluator.run(dataloader) + + def fit(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader, + epochs: int = 50) -> State: + log.info( + 'Trainer will run for\n' + f'model: {self.model}\n' + f'optimizer: {self.optimizer}\n' + f'loss: {self.loss_fn}') + self.val_handler.attach(self.trainer, + self.train_evaluator, + train_loader, + validation=False) + self.val_handler.attach(self.trainer, + self.valid_evaluator, + val_loader, + validation=True) + self.model.zero_grad() + self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + return best_score + + + + + def overfit_single_batch(self: TrainerType, + train_loader: DataLoader) -> State: + single_batch = [next(iter(train_loader))] + + if self.trainer.has_event_handler(self.val_handler, Events.EPOCH_COMPLETED): + self.trainer.remove_event_handler(self.val_handler, Events.EPOCH_COMPLETED) + + self.val_handler.attach(self.trainer, + self.train_evaluator, + single_batch, # type: ignore + validation=False) + out = self.trainer.run(single_batch, max_epochs=100) + return out + + def fit_debug(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader) -> State: + train_loader = iter(train_loader) + train_subset = [next(train_loader), next(train_loader)] + val_loader = iter(val_loader) # type: ignore + val_subset = [next(val_loader), next(val_loader)] # type ignore + out = self.fit(train_subset, val_subset, epochs=6) # type: ignore + return out + + def _attach_checkpoint(self: TrainerType) -> TrainerType: + ckpt = { + 'model': self.model, + 'optimizer': self.optimizer + } + if self.checkpoint_dir is not None: + self.valid_evaluator.add_event_handler( + Events.COMPLETED, self.checkpoint, ckpt) + return self + + + def attach(self: TrainerType) -> TrainerType: + ra = RunningAverage(output_transform=lambda x: x) + ra.attach(self.trainer, "Train Loss") + self.pbar.attach(self.trainer, ['Train Loss']) + self.val_pbar.attach(self.train_evaluator) + self.val_pbar.attach(self.valid_evaluator) + self.valid_evaluator.add_event_handler(Events.COMPLETED, + self.early_stop) + self = self._attach_checkpoint() + def graceful_exit(engine, e): + if isinstance(e, KeyboardInterrupt): + engine.terminate() + log.warn("CTRL-C caught. 
Exiting gracefully...") + else: + raise(e) + + self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit) + self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + return self + + +class AutoencoderTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs + + +class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + + return inputs, titles, targets, lengths, title_lengths + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, lengths, title_lengths = self.parse_batch(batch) + # import pdb; pdb.set_trace() + y_pred = self.model(inputs, lengths, titles, title_lengths) + return y_pred, targets + + +class Seq2seqTrainer(SequentialTrainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs, lengths + + +class TransformerTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + mask_inputs = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + mask_targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, mask_inputs, mask_targets + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets, mask_inputs, mask_targets = self.parse_batch(batch) + y_pred = self.model(inputs, + targets, + source_mask=mask_inputs, + target_mask=mask_targets) + targets = targets.view(-1) + y_pred = y_pred.view(targets.size(0), -1) + # TODO: BEAMSEARCH!! 
+ return y_pred, targets diff --git a/slp/trainer/trainer_title_no_validation.py b/slp/trainer/trainer_title_no_validation.py new file mode 100644 index 0000000..ed38ed0 --- /dev/null +++ b/slp/trainer/trainer_title_no_validation.py @@ -0,0 +1,404 @@ +import os +from typing import Union +import torch +import torch.nn as nn + +from ignite.handlers import EarlyStopping +from ignite.contrib.handlers import ProgressBar +from ignite.engine import Engine, Events, State +from ignite.metrics import RunningAverage, Loss + +from torch.optim.optimizer import Optimizer +from torch.nn.modules.loss import _Loss +from torch.utils.data import DataLoader + +from sklearn.metrics import f1_score +from typing import cast, List, Optional, Tuple, TypeVar +from slp.util import types +from slp.util.parallel import DataParallelModel, DataParallelCriterion + +from slp.trainer.handlers import CheckpointHandler, EvaluationHandler +from slp.util import from_checkpoint, to_device +from slp.util import log +from slp.util import system + + +TrainerType = TypeVar('TrainerType', bound='Trainer') + + +class Trainer(object): + def __init__(self: TrainerType, + model: nn.Module, + optimizer: Optimizer, + checkpoint_dir: str = '../../checkpoints', + experiment_name: str = 'experiment', + model_checkpoint: Optional[str] = None, + optimizer_checkpoint: Optional[str] = None, + metrics: types.GenericDict = None, + patience: int = 10, + validate_every: int = 1, + accumulation_steps: int = 1, + loss_fn: Union[_Loss, DataParallelCriterion] = None, + non_blocking: bool = True, + retain_graph: bool = False, + dtype: torch.dtype = torch.float, + device: str = 'cpu', + parallel: bool = False) -> None: + self.dtype = dtype + self.retain_graph = retain_graph + self.non_blocking = non_blocking + self.device = device + self.loss_fn = loss_fn + self.validate_every = validate_every + self.patience = patience + self.accumulation_steps = accumulation_steps + self.checkpoint_dir = checkpoint_dir + +# import pdb; pdb.set_trace() + + model_checkpoint = self._check_checkpoint(model_checkpoint) + optimizer_checkpoint = self._check_checkpoint(optimizer_checkpoint) + + + self.model = cast(nn.Module, from_checkpoint( + model_checkpoint, model, map_location=torch.device('cpu'))) + self.model = self.model.type(dtype).to(device) + self.optimizer = from_checkpoint(optimizer_checkpoint, optimizer) + self.parallel = parallel + if parallel: + if device == 'cpu': + raise ValueError("parallel can be used only with cuda device") + self.model = DataParallelModel(self.model).to(device) + self.loss_fn = DataParallelCriterion(self.loss_fn) # type: ignore + if metrics is None: + metrics = {} + if 'loss' not in metrics: + if self.parallel: + metrics['loss'] = Loss( + lambda x, y: self.loss_fn(x, y).mean()) # type: ignore + else: + metrics['loss'] = Loss(self.loss_fn) + self.trainer = Engine(self.train_step) + #self.train_evaluator = Engine(self.eval_step) + self.valid_evaluator = Engine(self.eval_step) + for name, metric in metrics.items(): + #metric.attach(self.train_evaluator, name) + metric.attach(self.valid_evaluator, name) + + self.pbar = ProgressBar() + self.val_pbar = ProgressBar(desc='Validation') + + if checkpoint_dir is not None: + self.checkpoint = CheckpointHandler( + checkpoint_dir, experiment_name, score_name='validation_loss', + score_function=self._score_fn, n_saved=2, + require_empty=False, save_as_state_dict=True) + + self.early_stop = EarlyStopping( + patience, self._score_fn, self.trainer) + + self.val_handler = EvaluationHandler(pbar=self.pbar, + 
validate_every=1, + early_stopping=self.early_stop) + self.attach() + log.info( + f'Trainer configured to run {experiment_name}\n' + f'\tpretrained model: {model_checkpoint} {optimizer_checkpoint}\n' + f'\tcheckpoint directory: {checkpoint_dir}\n' + f'\tpatience: {patience}\n' + f'\taccumulation steps: {accumulation_steps}\n' + f'\tnon blocking: {non_blocking}\n' + f'\tretain graph: {retain_graph}\n' + f'\tdevice: {device}\n' + f'\tmodel dtype: {dtype}\n' + f'\tparallel: {parallel}') + + def _check_checkpoint(self: TrainerType, + ckpt: Optional[str]) -> Optional[str]: + if ckpt is None: + return ckpt + if system.is_url(ckpt): + ckpt = system.download_url(cast(str, ckpt), self.checkpoint_dir) + ckpt = os.path.join(self.checkpoint_dir, ckpt) + return ckpt + + @staticmethod + def _score_fn(engine: Engine) -> float: + """Returns the scoring metric for checkpointing and early stopping + + Args: + engine (ignite.engine.Engine): The engine that calculates + the val loss + + Returns: + (float): The validation loss + """ + negloss: float = -engine.state.metrics['loss'] + return negloss + + def parse_batch( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets + + def get_predictions_and_targets( + self: TrainerType, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets = self.parse_batch(batch) + y_pred = self.model(inputs) + return y_pred, targets + + def train_step(self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> float: + self.model.train() +# import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) + + loss = self.loss_fn(y_pred, targets.long()) # type: ignore + if self.parallel: + loss = loss.mean() + loss = loss / self.accumulation_steps + loss.backward(retain_graph=self.retain_graph) + if (self.trainer.state.iteration + 1) % self.accumulation_steps == 0: + self.optimizer.step() # type: ignore + self.optimizer.zero_grad() + loss_value: float = loss.item() + return loss_value + + def eval_step( + self: TrainerType, + engine: Engine, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + self.model.eval() + with torch.no_grad(): +# import pdb; pdb.set_trace() + y_pred, targets = self.get_predictions_and_targets(batch) + +# f1 = f1_score(targets, y_pred, average='macro') +# print(f1) + + return y_pred, targets + + def predict(self: TrainerType, dataloader: DataLoader) -> State: + return self.valid_evaluator.run(dataloader) + + def fit(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader, + epochs: int = 50) -> State: + log.info( + 'Trainer will run for\n' + f'model: {self.model}\n' + f'optimizer: {self.optimizer}\n' + f'loss: {self.loss_fn}') +# self.val_handler.attach(self.trainer, +# self.train_evaluator, +# train_loader, +# validation=False) + self.val_handler.attach(self.trainer, + self.valid_evaluator, + val_loader, + validation=True) + self.model.zero_grad() + self.trainer.run(train_loader, max_epochs=epochs) + best_score = (-self.early_stop.best_score + if self.early_stop + else self.valid_evaluator.state.metrics['loss']) + return best_score + + + + + def overfit_single_batch(self: TrainerType, + train_loader: DataLoader) -> State: + single_batch = [next(iter(train_loader))] + + if self.trainer.has_event_handler(self.val_handler, Events.EPOCH_COMPLETED): + 
self.trainer.remove_event_handler(self.val_handler, Events.EPOCH_COMPLETED) + +# self.val_handler.attach(self.trainer, +# self.train_evaluator, +# single_batch, # type: ignore +# validation=False) + out = self.trainer.run(single_batch, max_epochs=100) + return out + + def fit_debug(self: TrainerType, + train_loader: DataLoader, + val_loader: DataLoader) -> State: + train_loader = iter(train_loader) + train_subset = [next(train_loader), next(train_loader)] + val_loader = iter(val_loader) # type: ignore + val_subset = [next(val_loader), next(val_loader)] # type ignore + out = self.fit(train_subset, val_subset, epochs=6) # type: ignore + return out + + def _attach_checkpoint(self: TrainerType) -> TrainerType: + ckpt = { + 'model': self.model, + 'optimizer': self.optimizer + } + if self.checkpoint_dir is not None: + self.valid_evaluator.add_event_handler( + Events.COMPLETED, self.checkpoint, ckpt) + return self + + + def attach(self: TrainerType) -> TrainerType: + ra = RunningAverage(output_transform=lambda x: x) + ra.attach(self.trainer, "Train Loss") + self.pbar.attach(self.trainer, ['Train Loss']) +# self.val_pbar.attach(self.train_evaluator) + self.val_pbar.attach(self.valid_evaluator) + self.valid_evaluator.add_event_handler(Events.COMPLETED, + self.early_stop) + self = self._attach_checkpoint() + def graceful_exit(engine, e): + if isinstance(e, KeyboardInterrupt): + engine.terminate() + log.warn("CTRL-C caught. Exiting gracefully...") + else: + raise(e) + + self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit) + #self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + # graceful_exit) + self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED, + graceful_exit) + return self + + +class AutoencoderTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs + + +class SequentialTrainerTouvlo(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + +# import pdb; pdb.set_trace() + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + number_of_sentences = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + length_of_sentences = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[5], + device=self.device, + non_blocking=self.non_blocking) + + return inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, titles, targets, number_of_sentences, length_of_sentences, title_lengths = self.parse_batch(batch) + #import pdb; pdb.set_trace() + y_pred = self.model(inputs, number_of_sentences, length_of_sentences, titles, title_lengths) + return y_pred, targets + + + +class SequentialTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + +# import pdb; pdb.set_trace() + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + titles = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + features = to_device(batch[2], + device=self.device, + 
non_blocking=self.non_blocking) + targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[4], + device=self.device, + non_blocking=self.non_blocking) + title_lengths = to_device(batch[5], + device=self.device, + non_blocking=self.non_blocking) + + return inputs, titles, features, targets, lengths, title_lengths + + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + + # import pdb; pdb.set_trace() + inputs, titles, features, targets, lengths, title_lengths = self.parse_batch(batch) + y_pred = self.model(inputs, lengths, features, titles, title_lengths) + return y_pred, targets + +class Seq2seqTrainer(SequentialTrainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + lengths = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + return inputs, inputs, lengths + + +class TransformerTrainer(Trainer): + def parse_batch( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs = to_device(batch[0], + device=self.device, + non_blocking=self.non_blocking) + targets = to_device(batch[1], + device=self.device, + non_blocking=self.non_blocking) + mask_inputs = to_device(batch[2], + device=self.device, + non_blocking=self.non_blocking) + mask_targets = to_device(batch[3], + device=self.device, + non_blocking=self.non_blocking) + return inputs, targets, mask_inputs, mask_targets + + def get_predictions_and_targets( + self, + batch: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]: + inputs, targets, mask_inputs, mask_targets = self.parse_batch(batch) + y_pred = self.model(inputs, + targets, + source_mask=mask_inputs, + target_mask=mask_targets) + targets = targets.view(-1) + y_pred = y_pred.view(targets.size(0), -1) + # TODO: BEAMSEARCH!! + return y_pred, targets
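A note on the BertTrainer added to slp/trainer/trainer.py above: parse_batch unpacks each batch as [inputs, targets, masks, segments], and get_predictions_and_targets calls self.model(inputs, token_type_ids=segments, attention_mask=masks) expecting logits back directly. The sketch below shows one collator shape that would produce batches in that order; BertCollator and its padding scheme are illustrative assumptions and are not defined anywhere in this patch.

# Sketch only: a collate function compatible with BertTrainer.parse_batch.
# BertCollator is a hypothetical helper, not part of this patch.
import torch
from torch.nn.utils.rnn import pad_sequence


class BertCollator:
    def __init__(self, pad_token_id=0):
        self.pad_token_id = pad_token_id

    def __call__(self, batch):
        # batch: iterable of (token_ids: LongTensor, label: int) pairs
        token_ids, labels = zip(*batch)
        inputs = pad_sequence(token_ids, batch_first=True,
                              padding_value=self.pad_token_id)
        masks = (inputs != self.pad_token_id).long()   # attention mask over real tokens
        segments = torch.zeros_like(inputs)            # single-segment (sentence A) ids
        targets = torch.tensor(labels, dtype=torch.long)
        # Order matters: BertTrainer.parse_batch reads batch[0..3] in this order.
        return [inputs, targets, masks, segments]

With DataLoader(dataset, collate_fn=BertCollator(pad_token_id=0)), batches line up with BertTrainer.parse_batch; the wrapped model is assumed to return logits when called without labels, as older BERT classification heads typically do.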
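A note on the new return value of fit in the trainers above: _score_fn reports the negated validation loss because ignite's EarlyStopping treats larger scores as better, and fit flips the sign of early_stop.best_score before returning, so the caller receives the best (lowest) validation loss as a positive number. A minimal sketch with made-up loss values:

# Illustration only: made-up numbers showing the sign convention of fit()'s return value.
val_losses = [0.71, 0.64, 0.69]           # validation loss after each epoch
scores = [-loss for loss in val_losses]   # what _score_fn hands to EarlyStopping
best_score = max(scores)                  # EarlyStopping keeps the largest score, here -0.64
print(-best_score)                        # fit() returns 0.64, the best validation loss seen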