From 855c2dd6a3d10c819cfc7025fe0f38d4f5df57b8 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Thu, 1 Mar 2018 22:41:14 +0100 Subject: [PATCH 01/10] make autoregressive decoder temporal stateful --- neuralmonkey/decoders/autoregressive.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 7b83d3e5b..666a8b053 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -15,6 +15,7 @@ from neuralmonkey.model.model_part import ModelPart, FeedDict, InitializerSpecs from neuralmonkey.logging import log, warn from neuralmonkey.model.sequence import EmbeddedSequence +from neuralmonkey.model.stateful import TemporalStateful from neuralmonkey.nn.utils import dropout from neuralmonkey.tf_utils import get_variable, get_state_shape_invariants from neuralmonkey.vocabulary import Vocabulary, START_TOKEN, UNK_TOKEN_INDEX @@ -93,7 +94,7 @@ class DecoderFeedables(NamedTuple( # pylint: disable=too-many-public-methods,too-many-instance-attributes -class AutoregressiveDecoder(ModelPart): +class AutoregressiveDecoder(ModelPart, TemporalStateful): # pylint: disable=too-many-arguments def __init__(self, @@ -475,3 +476,19 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: fd[self.train_mask] = weights return fd + + @tensor + def temporal_states(self) -> tf.Tensor: + return tf.cond( + self.train_mode, + lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-2], + lambda: tf.transpose( + self.runtime_output_states, [1, 0, 2])[:, :-2]) + + @tensor + def temporal_mask(self) -> tf.Tensor: + return tf.cond( + self.train_mode, + lambda: tf.transpose(self.train_mask, [1, 0])[:, :-1], + lambda: tf.to_float(tf.transpose( + self.runtime_mask, [1, 0])[:, :-1])) From 5cda779adade699851686df520fa64b5e8fa4c54 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 2 Mar 2018 11:02:45 +0100 Subject: 
[PATCH 02/10] make the seq. labeler no assume encoder has an input sequence --- neuralmonkey/decoders/sequence_labeler.py | 34 +++++++++++++---------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 89add3de3..4eb4f0f93 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -79,17 +79,19 @@ def logits(self) -> tf.Tensor: multiplication_3d = tf.squeeze(multiplication, squeeze_dims=[2]) biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0) + logits = multiplication_3d + biases_3d - embedded_inputs = tf.expand_dims( - self.encoder.input_sequence.temporal_states, 2) - dweights_4d = tf.expand_dims( - tf.expand_dims(self.decoding_residual_w, 0), 0) + if hasattr(self.encoder, "input_sequence"): + embedded_inputs = tf.expand_dims( + self.encoder.input_sequence.temporal_states, 2) + dweights_4d = tf.expand_dims( + tf.expand_dims(self.decoding_residual_w, 0), 0) - dmultiplication = tf.nn.conv2d( - embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME") - dmultiplication_3d = tf.squeeze(dmultiplication, squeeze_dims=[2]) + dmultiplication = tf.nn.conv2d( + embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME") + dmultiplication_3d = tf.squeeze(dmultiplication, squeeze_dims=[2]) - logits = multiplication_3d + dmultiplication_3d + biases_3d + logits += dmultiplication_3d return logits @tensor @@ -102,13 +104,15 @@ def decoded(self) -> tf.Tensor: @tensor def cost(self) -> tf.Tensor: - loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=self.train_targets, logits=self.logits) - - # loss is now of shape [batch, time]. 
Need to mask it now by - # element-wise multiplication with weights placeholder - weighted_loss = loss * self.train_weights - return tf.reduce_sum(weighted_loss) + min_time = tf.minimum(tf.shape(self.train_targets)[1], + tf.shape(self.logits)[1]) + + # pylint: disable=unsubscriptable-object + return tf.contrib.seq2seq.sequence_loss( + logits=self.logits[:, :min_time], + targets=self.train_targets[:, :min_time], + weights=self.encoder.temporal_mask[:, :min_time]) + # pylint: enable=unsubscriptable-object @property def train_loss(self) -> tf.Tensor: From feb6b3a7f3f107ac303434eccd35206de0bab07f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:35:59 +0100 Subject: [PATCH 03/10] make the names in labeler more general --- neuralmonkey/decoders/sequence_labeler.py | 32 +++++++++++------------ neuralmonkey/runners/label_runner.py | 2 +- tests/labeler.ini | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 4eb4f0f93..f00480d75 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -1,11 +1,9 @@ -from typing import Optional, Union - import tensorflow as tf +from typeguard import check_argument_types from neuralmonkey.dataset import Dataset from neuralmonkey.model.model_part import ModelPart, FeedDict, InitializerSpecs -from neuralmonkey.encoders.recurrent import RecurrentEncoder -from neuralmonkey.encoders.facebook_conv import SentenceEncoder +from neuralmonkey.model.stateful import TemporalStateful from neuralmonkey.vocabulary import Vocabulary from neuralmonkey.decorators import tensor from neuralmonkey.tf_utils import get_variable @@ -17,22 +15,24 @@ class SequenceLabeler(ModelPart): # pylint: disable=too-many-arguments def __init__(self, name: str, - encoder: Union[RecurrentEncoder, SentenceEncoder], + input_sequence: TemporalStateful, vocabulary: Vocabulary, data_id: str, 
dropout_keep_prob: float = 1.0, - save_checkpoint: Optional[str] = None, - load_checkpoint: Optional[str] = None, + save_checkpoint: str = None, + load_checkpoint: str = None, initializers: InitializerSpecs = None) -> None: + check_argument_types() ModelPart.__init__(self, name, save_checkpoint, load_checkpoint, initializers) - self.encoder = encoder + self.input_sequence = input_sequence self.vocabulary = vocabulary self.data_id = data_id self.dropout_keep_prob = dropout_keep_prob - self.rnn_size = int(self.encoder.temporal_states.get_shape()[-1]) + self.input_size = int( + self.input_sequence.temporal_states.get_shape()[-1]) with self.use_scope(): self.train_targets = tf.placeholder( @@ -45,7 +45,7 @@ def __init__(self, def decoding_w(self) -> tf.Variable: return get_variable( name="state_to_word_W", - shape=[self.rnn_size, len(self.vocabulary)], + shape=[self.input_size, len(self.vocabulary)], initializer=tf.glorot_normal_initializer()) @tensor @@ -57,7 +57,7 @@ def decoding_b(self) -> tf.Variable: @tensor def decoding_residual_w(self) -> tf.Variable: - input_dim = self.encoder.input_sequence.dimension + input_dim = self.input_sequence.input_sequence.dimension return get_variable( name="emb_to_word_W", shape=[input_dim, len(self.vocabulary)], @@ -71,19 +71,19 @@ def logits(self) -> tf.Tensor: # TODO dropout needs to be revisited - encoder_states = tf.expand_dims(self.encoder.temporal_states, 2) + intpus_states = tf.expand_dims(self.input_sequence.temporal_states, 2) weights_4d = tf.expand_dims(tf.expand_dims(self.decoding_w, 0), 0) multiplication = tf.nn.conv2d( - encoder_states, weights_4d, [1, 1, 1, 1], "SAME") + intpus_states, weights_4d, [1, 1, 1, 1], "SAME") multiplication_3d = tf.squeeze(multiplication, squeeze_dims=[2]) biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0) logits = multiplication_3d + biases_3d - if hasattr(self.encoder, "input_sequence"): + if hasattr(self.input_sequence, "input_sequence"): embedded_inputs = tf.expand_dims( - 
self.encoder.input_sequence.temporal_states, 2) + self.input_sequence.input_sequence.temporal_states, 2) dweights_4d = tf.expand_dims( tf.expand_dims(self.decoding_residual_w, 0), 0) @@ -111,7 +111,7 @@ def cost(self) -> tf.Tensor: return tf.contrib.seq2seq.sequence_loss( logits=self.logits[:, :min_time], targets=self.train_targets[:, :min_time], - weights=self.encoder.temporal_mask[:, :min_time]) + weights=self.input_sequence.temporal_mask[:, :min_time]) # pylint: enable=unsubscriptable-object @property diff --git a/neuralmonkey/runners/label_runner.py b/neuralmonkey/runners/label_runner.py index 97d9baced..0f8ffbacb 100644 --- a/neuralmonkey/runners/label_runner.py +++ b/neuralmonkey/runners/label_runner.py @@ -87,7 +87,7 @@ def get_executable(self, num_sessions: int) -> LabelRunExecutable: fetches = { "label_logprobs": self._decoder.logprobs, - "input_mask": self._decoder.encoder.input_sequence.temporal_mask} + "input_mask": self._decoder.input_sequence.temporal_mask} if compute_losses: fetches["loss"] = self._decoder.cost diff --git a/tests/labeler.ini b/tests/labeler.ini index 8b2c8db64..a6da1069d 100644 --- a/tests/labeler.ini +++ b/tests/labeler.ini @@ -63,7 +63,7 @@ vocabulary= [decoder] class=decoders.sequence_labeler.SequenceLabeler name="tagger" -encoder= +input_sequence= data_id="tags" dropout_keep_prob=0.5 vocabulary= From 6ce64e851739d3d7066e04b3ec1bf56c5203bbfb Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:39:59 +0100 Subject: [PATCH 04/10] add comment for computing loss --- neuralmonkey/decoders/sequence_labeler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index f00480d75..aebfd9689 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -107,6 +107,11 @@ def cost(self) -> tf.Tensor: min_time = tf.minimum(tf.shape(self.train_targets)[1], tf.shape(self.logits)[1]) + # In case 
the labeler is stacked on a decoder which emits also an end + # symbol (or for some reason emits more symbol than we have in the + # ground truth labels), we trim the sequences to the length of a + # shorter one. + # pylint: disable=unsubscriptable-object return tf.contrib.seq2seq.sequence_loss( logits=self.logits[:, :min_time], From 8918dd4240a640d23edb6db38962cceeb1848694 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:59:14 +0100 Subject: [PATCH 05/10] fix mypy --- neuralmonkey/decoders/sequence_labeler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index aebfd9689..09e6dd952 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -57,7 +57,8 @@ def decoding_b(self) -> tf.Variable: @tensor def decoding_residual_w(self) -> tf.Variable: - input_dim = self.input_sequence.input_sequence.dimension + input_dim = ( + self.input_sequence.input_sequence.dimension) # type: ignore return get_variable( name="emb_to_word_W", shape=[input_dim, len(self.vocabulary)], @@ -82,8 +83,8 @@ def logits(self) -> tf.Tensor: logits = multiplication_3d + biases_3d if hasattr(self.input_sequence, "input_sequence"): - embedded_inputs = tf.expand_dims( - self.input_sequence.input_sequence.temporal_states, 2) + inputs_input = self.input_sequence.input_sequence # type: ignore + embedded_inputs = tf.expand_dims(inputs_input.temporal_states, 2) dweights_4d = tf.expand_dims( tf.expand_dims(self.decoding_residual_w, 0), 0) From cb426d6d8de8141091ef6bf1287bb59c49d52e6a Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Wed, 7 Mar 2018 15:30:24 +0100 Subject: [PATCH 06/10] rename transformer dimension so it does not collide --- neuralmonkey/decoders/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/neuralmonkey/decoders/transformer.py 
b/neuralmonkey/decoders/transformer.py index 00f517547..ed250974c 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -126,10 +126,15 @@ def __init__(self, self.encoder_states = get_attention_states(self.encoder) self.encoder_mask = get_attention_mask(self.encoder) - self.dimension = ( - self.encoder_states.get_shape()[2].value) # type: ignore - if self.embedding_size != self.dimension: + # This assertion (and the "int" type declaration below) here is because + # of mypy not being able to handle the tf.Tensor type. + assert self.encoder_states is not None + + self.model_dimension = ( + self.encoder_states.get_shape()[2].value) # type: int + + if self.embedding_size != self.model_dimension: raise ValueError("Model dimension and input embedding size" "do not match") @@ -140,7 +145,7 @@ def __init__(self, @property def output_dimension(self) -> int: - return self.dimension + return self.model_dimension def embed_inputs(self, inputs: tf.Tensor) -> tf.Tensor: embedded = tf.nn.embedding_lookup(self.embedding_matrix, inputs) @@ -156,7 +161,7 @@ def embed_inputs(self, inputs: tf.Tensor) -> tf.Tensor: embedded *= math.sqrt(embedding_size) length = tf.shape(inputs)[1] - return embedded + position_signal(self.dimension, length) + return embedded + position_signal(self.model_dimension, length) @tensor def embedded_train_inputs(self) -> tf.Tensor: @@ -241,7 +246,8 @@ def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor: ff_hidden = dropout(ff_hidden, self.dropout_keep_prob, self.train_mode) # Feed-forward output projection - ff_output = tf.layers.dense(ff_hidden, self.dimension, name="output") + ff_output = tf.layers.dense( + ff_hidden, self.model_dimension, name="output") # Apply dropout on the output projection ff_output = dropout(ff_output, self.dropout_keep_prob, self.train_mode) From 9990b19b1cd659f1ee9941701d3c9e31eb0bee37 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Thu, 8 Mar 2018 16:39:59 +0100 
Subject: [PATCH 07/10] document labeling autoregressive decoder --- neuralmonkey/decoders/sequence_labeler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 09e6dd952..510a7ca28 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -10,7 +10,16 @@ class SequenceLabeler(ModelPart): - """Classifier assing a label to each encoder's state.""" + """Classifier assigning a label to each input state. + + If the labeler output has an input sequence with embeddings, these are used + as additional input to the labeler. + + Note that when the labeler is stacked on an autoregressive decoder, it + labels the symbol that is currently generated by the decoder, i.e., the + decoder's state has not yet been updated by putting the decoded symbol on + its input. + """ # pylint: disable=too-many-arguments def __init__(self, From 2c0b915903d045be778486058893c4872a45838c Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 9 Mar 2018 21:16:23 +0100 Subject: [PATCH 08/10] multi-task learning with labeler to tests --- tests/bpe.ini | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/bpe.ini b/tests/bpe.ini index 482aede74..f9f07369d 100644 --- a/tests/bpe.ini +++ b/tests/bpe.ini @@ -94,10 +94,18 @@ data_id="target_bpe" max_output_len=10 vocabulary= +[labeler] +class=decoders.sequence_labeler.SequenceLabeler +name="tagger" +input_sequence= +data_id="target_bpe" +dropout_keep_prob=0.5 +vocabulary= + [trainer] ; This block just fills the arguments of the trainer __init__ method. 
class=trainers.cross_entropy_trainer.CrossEntropyTrainer -decoders=[] +decoders=[,] l2_weight=1.0e-8 clip_norm=1.0 optimizer= From bc0caeb91a3e152835677e06866cb1833e617347 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Tue, 13 Mar 2018 10:37:23 +0100 Subject: [PATCH 09/10] add runtime labeling to tests --- tests/bpe.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/bpe.ini b/tests/bpe.ini index f9f07369d..c37baa598 100644 --- a/tests/bpe.ini +++ b/tests/bpe.ini @@ -10,7 +10,7 @@ epochs=2 train_dataset= val_dataset= trainer= -runners=[] +runners=[,] evaluation=[("target", evaluators.BLEU), ("target_greedy", "target", evaluators.BLEU)] val_preview_num_examples=5 val_preview_input_series=["source", "target", "target_bpe"] @@ -122,3 +122,8 @@ class=runners.GreedyRunner decoder= postprocess= output_series="target_greedy" + +[lab_runner] +class=runners.LabelRunner +decoder= +output_series="tags" From d0da72f4a18bbd9fd8aa28dbdf72a7e9cc77f19f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Tue, 13 Mar 2018 10:40:30 +0100 Subject: [PATCH 10/10] fix striping --- neuralmonkey/decoders/autoregressive.py | 6 ++++-- neuralmonkey/decoders/sequence_labeler.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 666a8b053..3ffdc6356 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -479,14 +479,16 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: @tensor def temporal_states(self) -> tf.Tensor: + # strip the last symbol which is return tf.cond( self.train_mode, - lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-2], + lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-1], lambda: tf.transpose( - self.runtime_output_states, [1, 0, 2])[:, :-2]) + self.runtime_output_states, [1, 0, 2])[:, :-1]) @tensor def temporal_mask(self) -> 
tf.Tensor:
+        # strip the last symbol which is </s>
         return tf.cond(
             self.train_mode,
             lambda: tf.transpose(self.train_mask, [1, 0])[:, :-1],
             lambda: tf.to_float(tf.transpose(
                 self.runtime_mask, [1, 0])[:, :-1]))
diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py
index 510a7ca28..f0a9b7974 100644
--- a/neuralmonkey/decoders/sequence_labeler.py
+++ b/neuralmonkey/decoders/sequence_labeler.py
@@ -17,7 +17,7 @@ class SequenceLabeler(ModelPart):
 
     Note that when the labeler is stacked on an autoregressive decoder, it
     labels the symbol that is currently generated by the decoder, i.e., the
-    decoder's state has not yet been updated by putting the decoded symbol on
+    decoder state has not yet been updated by putting the decoded symbol on
     its input.
     """