From 855c2dd6a3d10c819cfc7025fe0f38d4f5df57b8 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Thu, 1 Mar 2018 22:41:14 +0100 Subject: [PATCH 01/10] make autoregressive decoder temporal stateful --- neuralmonkey/decoders/autoregressive.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 7b83d3e5b..666a8b053 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -15,6 +15,7 @@ from neuralmonkey.model.model_part import ModelPart, FeedDict, InitializerSpecs from neuralmonkey.logging import log, warn from neuralmonkey.model.sequence import EmbeddedSequence +from neuralmonkey.model.stateful import TemporalStateful from neuralmonkey.nn.utils import dropout from neuralmonkey.tf_utils import get_variable, get_state_shape_invariants from neuralmonkey.vocabulary import Vocabulary, START_TOKEN, UNK_TOKEN_INDEX @@ -93,7 +94,7 @@ class DecoderFeedables(NamedTuple( # pylint: disable=too-many-public-methods,too-many-instance-attributes -class AutoregressiveDecoder(ModelPart): +class AutoregressiveDecoder(ModelPart, TemporalStateful): # pylint: disable=too-many-arguments def __init__(self, @@ -475,3 +476,19 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: fd[self.train_mask] = weights return fd + + @tensor + def temporal_states(self) -> tf.Tensor: + return tf.cond( + self.train_mode, + lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-2], + lambda: tf.transpose( + self.runtime_output_states, [1, 0, 2])[:, :-2]) + + @tensor + def temporal_mask(self) -> tf.Tensor: + return tf.cond( + self.train_mode, + lambda: tf.transpose(self.train_mask, [1, 0])[:, :-1], + lambda: tf.to_float(tf.transpose( + self.runtime_mask, [1, 0])[:, :-1])) From 5cda779adade699851686df520fa64b5e8fa4c54 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 2 Mar 2018 11:02:45 +0100 Subject: 
[PATCH 02/10] make the seq. labeler no assume encoder has an input sequence --- neuralmonkey/decoders/sequence_labeler.py | 34 +++++++++++++---------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 89add3de3..4eb4f0f93 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -79,17 +79,19 @@ def logits(self) -> tf.Tensor: multiplication_3d = tf.squeeze(multiplication, squeeze_dims=[2]) biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0) + logits = multiplication_3d + biases_3d - embedded_inputs = tf.expand_dims( - self.encoder.input_sequence.temporal_states, 2) - dweights_4d = tf.expand_dims( - tf.expand_dims(self.decoding_residual_w, 0), 0) + if hasattr(self.encoder, "input_sequence"): + embedded_inputs = tf.expand_dims( + self.encoder.input_sequence.temporal_states, 2) + dweights_4d = tf.expand_dims( + tf.expand_dims(self.decoding_residual_w, 0), 0) - dmultiplication = tf.nn.conv2d( - embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME") - dmultiplication_3d = tf.squeeze(dmultiplication, squeeze_dims=[2]) + dmultiplication = tf.nn.conv2d( + embedded_inputs, dweights_4d, [1, 1, 1, 1], "SAME") + dmultiplication_3d = tf.squeeze(dmultiplication, squeeze_dims=[2]) - logits = multiplication_3d + dmultiplication_3d + biases_3d + logits += dmultiplication_3d return logits @tensor @@ -102,13 +104,15 @@ def decoded(self) -> tf.Tensor: @tensor def cost(self) -> tf.Tensor: - loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=self.train_targets, logits=self.logits) - - # loss is now of shape [batch, time]. 
Need to mask it now by - # element-wise multiplication with weights placeholder - weighted_loss = loss * self.train_weights - return tf.reduce_sum(weighted_loss) + min_time = tf.minimum(tf.shape(self.train_targets)[1], + tf.shape(self.logits)[1]) + + # pylint: disable=unsubscriptable-object + return tf.contrib.seq2seq.sequence_loss( + logits=self.logits[:, :min_time], + targets=self.train_targets[:, :min_time], + weights=self.encoder.temporal_mask[:, :min_time]) + # pylint: enable=unsubscriptable-object @property def train_loss(self) -> tf.Tensor: From feb6b3a7f3f107ac303434eccd35206de0bab07f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:35:59 +0100 Subject: [PATCH 03/10] make the names in labeler more general --- neuralmonkey/decoders/sequence_labeler.py | 32 +++++++++++------------ neuralmonkey/runners/label_runner.py | 2 +- tests/labeler.ini | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 4eb4f0f93..f00480d75 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -1,11 +1,9 @@ -from typing import Optional, Union - import tensorflow as tf +from typeguard import check_argument_types from neuralmonkey.dataset import Dataset from neuralmonkey.model.model_part import ModelPart, FeedDict, InitializerSpecs -from neuralmonkey.encoders.recurrent import RecurrentEncoder -from neuralmonkey.encoders.facebook_conv import SentenceEncoder +from neuralmonkey.model.stateful import TemporalStateful from neuralmonkey.vocabulary import Vocabulary from neuralmonkey.decorators import tensor from neuralmonkey.tf_utils import get_variable @@ -17,22 +15,24 @@ class SequenceLabeler(ModelPart): # pylint: disable=too-many-arguments def __init__(self, name: str, - encoder: Union[RecurrentEncoder, SentenceEncoder], + input_sequence: TemporalStateful, vocabulary: Vocabulary, data_id: str, 
dropout_keep_prob: float = 1.0, - save_checkpoint: Optional[str] = None, - load_checkpoint: Optional[str] = None, + save_checkpoint: str = None, + load_checkpoint: str = None, initializers: InitializerSpecs = None) -> None: + check_argument_types() ModelPart.__init__(self, name, save_checkpoint, load_checkpoint, initializers) - self.encoder = encoder + self.input_sequence = input_sequence self.vocabulary = vocabulary self.data_id = data_id self.dropout_keep_prob = dropout_keep_prob - self.rnn_size = int(self.encoder.temporal_states.get_shape()[-1]) + self.input_size = int( + self.input_sequence.temporal_states.get_shape()[-1]) with self.use_scope(): self.train_targets = tf.placeholder( @@ -45,7 +45,7 @@ def __init__(self, def decoding_w(self) -> tf.Variable: return get_variable( name="state_to_word_W", - shape=[self.rnn_size, len(self.vocabulary)], + shape=[self.input_size, len(self.vocabulary)], initializer=tf.glorot_normal_initializer()) @tensor @@ -57,7 +57,7 @@ def decoding_b(self) -> tf.Variable: @tensor def decoding_residual_w(self) -> tf.Variable: - input_dim = self.encoder.input_sequence.dimension + input_dim = self.input_sequence.input_sequence.dimension return get_variable( name="emb_to_word_W", shape=[input_dim, len(self.vocabulary)], @@ -71,19 +71,19 @@ def logits(self) -> tf.Tensor: # TODO dropout needs to be revisited - encoder_states = tf.expand_dims(self.encoder.temporal_states, 2) + intpus_states = tf.expand_dims(self.input_sequence.temporal_states, 2) weights_4d = tf.expand_dims(tf.expand_dims(self.decoding_w, 0), 0) multiplication = tf.nn.conv2d( - encoder_states, weights_4d, [1, 1, 1, 1], "SAME") + intpus_states, weights_4d, [1, 1, 1, 1], "SAME") multiplication_3d = tf.squeeze(multiplication, squeeze_dims=[2]) biases_3d = tf.expand_dims(tf.expand_dims(self.decoding_b, 0), 0) logits = multiplication_3d + biases_3d - if hasattr(self.encoder, "input_sequence"): + if hasattr(self.input_sequence, "input_sequence"): embedded_inputs = tf.expand_dims( - 
self.encoder.input_sequence.temporal_states, 2) + self.input_sequence.input_sequence.temporal_states, 2) dweights_4d = tf.expand_dims( tf.expand_dims(self.decoding_residual_w, 0), 0) @@ -111,7 +111,7 @@ def cost(self) -> tf.Tensor: return tf.contrib.seq2seq.sequence_loss( logits=self.logits[:, :min_time], targets=self.train_targets[:, :min_time], - weights=self.encoder.temporal_mask[:, :min_time]) + weights=self.input_sequence.temporal_mask[:, :min_time]) # pylint: enable=unsubscriptable-object @property diff --git a/neuralmonkey/runners/label_runner.py b/neuralmonkey/runners/label_runner.py index 97d9baced..0f8ffbacb 100644 --- a/neuralmonkey/runners/label_runner.py +++ b/neuralmonkey/runners/label_runner.py @@ -87,7 +87,7 @@ def get_executable(self, num_sessions: int) -> LabelRunExecutable: fetches = { "label_logprobs": self._decoder.logprobs, - "input_mask": self._decoder.encoder.input_sequence.temporal_mask} + "input_mask": self._decoder.input_sequence.temporal_mask} if compute_losses: fetches["loss"] = self._decoder.cost diff --git a/tests/labeler.ini b/tests/labeler.ini index 8b2c8db64..a6da1069d 100644 --- a/tests/labeler.ini +++ b/tests/labeler.ini @@ -63,7 +63,7 @@ vocabulary= [decoder] class=decoders.sequence_labeler.SequenceLabeler name="tagger" -encoder= +input_sequence= data_id="tags" dropout_keep_prob=0.5 vocabulary= From 6ce64e851739d3d7066e04b3ec1bf56c5203bbfb Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:39:59 +0100 Subject: [PATCH 04/10] add comment for computing loss --- neuralmonkey/decoders/sequence_labeler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index f00480d75..aebfd9689 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -107,6 +107,11 @@ def cost(self) -> tf.Tensor: min_time = tf.minimum(tf.shape(self.train_targets)[1], tf.shape(self.logits)[1]) + # In case 
the labeler is stacked on a decoder which emits also an end + # symbol (or for some reason emits more symbol than we have in the + # ground truth labels), we trim the sequences to the length of a + # shorter one. + # pylint: disable=unsubscriptable-object return tf.contrib.seq2seq.sequence_loss( logits=self.logits[:, :min_time], From 8918dd4240a640d23edb6db38962cceeb1848694 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Mon, 5 Mar 2018 10:59:14 +0100 Subject: [PATCH 05/10] fix mypy --- neuralmonkey/decoders/sequence_labeler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index aebfd9689..09e6dd952 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -57,7 +57,8 @@ def decoding_b(self) -> tf.Variable: @tensor def decoding_residual_w(self) -> tf.Variable: - input_dim = self.input_sequence.input_sequence.dimension + input_dim = ( + self.input_sequence.input_sequence.dimension) # type: ignore return get_variable( name="emb_to_word_W", shape=[input_dim, len(self.vocabulary)], @@ -82,8 +83,8 @@ def logits(self) -> tf.Tensor: logits = multiplication_3d + biases_3d if hasattr(self.input_sequence, "input_sequence"): - embedded_inputs = tf.expand_dims( - self.input_sequence.input_sequence.temporal_states, 2) + inputs_input = self.input_sequence.input_sequence # type: ignore + embedded_inputs = tf.expand_dims(inputs_input.temporal_states, 2) dweights_4d = tf.expand_dims( tf.expand_dims(self.decoding_residual_w, 0), 0) From cb426d6d8de8141091ef6bf1287bb59c49d52e6a Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Wed, 7 Mar 2018 15:30:24 +0100 Subject: [PATCH 06/10] rename transformer dimension so it does not collide --- neuralmonkey/decoders/transformer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/neuralmonkey/decoders/transformer.py 
b/neuralmonkey/decoders/transformer.py index 00f517547..ed250974c 100644 --- a/neuralmonkey/decoders/transformer.py +++ b/neuralmonkey/decoders/transformer.py @@ -126,10 +126,15 @@ def __init__(self, self.encoder_states = get_attention_states(self.encoder) self.encoder_mask = get_attention_mask(self.encoder) - self.dimension = ( - self.encoder_states.get_shape()[2].value) # type: ignore - if self.embedding_size != self.dimension: + # This assertion (and the "int" type declaration below) here is because + # of mypy not being able to handle the tf.Tensor type. + assert self.encoder_states is not None + + self.model_dimension = ( + self.encoder_states.get_shape()[2].value) # type: int + + if self.embedding_size != self.model_dimension: raise ValueError("Model dimension and input embedding size" "do not match") @@ -140,7 +145,7 @@ def __init__(self, @property def output_dimension(self) -> int: - return self.dimension + return self.model_dimension def embed_inputs(self, inputs: tf.Tensor) -> tf.Tensor: embedded = tf.nn.embedding_lookup(self.embedding_matrix, inputs) @@ -156,7 +161,7 @@ def embed_inputs(self, inputs: tf.Tensor) -> tf.Tensor: embedded *= math.sqrt(embedding_size) length = tf.shape(inputs)[1] - return embedded + position_signal(self.dimension, length) + return embedded + position_signal(self.model_dimension, length) @tensor def embedded_train_inputs(self) -> tf.Tensor: @@ -241,7 +246,8 @@ def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor: ff_hidden = dropout(ff_hidden, self.dropout_keep_prob, self.train_mode) # Feed-forward output projection - ff_output = tf.layers.dense(ff_hidden, self.dimension, name="output") + ff_output = tf.layers.dense( + ff_hidden, self.model_dimension, name="output") # Apply dropout on the output projection ff_output = dropout(ff_output, self.dropout_keep_prob, self.train_mode) From 9990b19b1cd659f1ee9941701d3c9e31eb0bee37 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Thu, 8 Mar 2018 16:39:59 +0100 
Subject: [PATCH 07/10] document labeling autoregressive decoder --- neuralmonkey/decoders/sequence_labeler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py index 09e6dd952..510a7ca28 100644 --- a/neuralmonkey/decoders/sequence_labeler.py +++ b/neuralmonkey/decoders/sequence_labeler.py @@ -10,7 +10,16 @@ class SequenceLabeler(ModelPart): - """Classifier assing a label to each encoder's state.""" + """Classifier assigning a label to each input state. + + If the labeler output has an input sequence with embeddings, these are used + as additional input to the labeler. + + Note that when the labeler is stacked on an autoregressive decoder, it + labels the symbol that is currently generated by the decoder, i.e., the + decoder's state has not yet been updated by putting the decoded symbol on + its input. + """ # pylint: disable=too-many-arguments def __init__(self, From 2c0b915903d045be778486058893c4872a45838c Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Fri, 9 Mar 2018 21:16:23 +0100 Subject: [PATCH 08/10] multi-task learning with labeler to tests --- tests/bpe.ini | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/bpe.ini b/tests/bpe.ini index 482aede74..f9f07369d 100644 --- a/tests/bpe.ini +++ b/tests/bpe.ini @@ -94,10 +94,18 @@ data_id="target_bpe" max_output_len=10 vocabulary= +[labeler] +class=decoders.sequence_labeler.SequenceLabeler +name="tagger" +input_sequence= +data_id="target_bpe" +dropout_keep_prob=0.5 +vocabulary= + [trainer] ; This block just fills the arguments of the trainer __init__ method. 
class=trainers.cross_entropy_trainer.CrossEntropyTrainer -decoders=[] +decoders=[,] l2_weight=1.0e-8 clip_norm=1.0 optimizer= From bc0caeb91a3e152835677e06866cb1833e617347 Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Tue, 13 Mar 2018 10:37:23 +0100 Subject: [PATCH 09/10] add runtime labeling to tests --- tests/bpe.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/bpe.ini b/tests/bpe.ini index f9f07369d..c37baa598 100644 --- a/tests/bpe.ini +++ b/tests/bpe.ini @@ -10,7 +10,7 @@ epochs=2 train_dataset= val_dataset= trainer= -runners=[] +runners=[,] evaluation=[("target", evaluators.BLEU), ("target_greedy", "target", evaluators.BLEU)] val_preview_num_examples=5 val_preview_input_series=["source", "target", "target_bpe"] @@ -122,3 +122,8 @@ class=runners.GreedyRunner decoder= postprocess= output_series="target_greedy" + +[lab_runner] +class=runners.LabelRunner +decoder= +output_series="tags" From d0da72f4a18bbd9fd8aa28dbdf72a7e9cc77f19f Mon Sep 17 00:00:00 2001 From: Jindrich Libovicky Date: Tue, 13 Mar 2018 10:40:30 +0100 Subject: [PATCH 10/10] fix striping --- neuralmonkey/decoders/autoregressive.py | 6 ++++-- neuralmonkey/decoders/sequence_labeler.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/neuralmonkey/decoders/autoregressive.py b/neuralmonkey/decoders/autoregressive.py index 666a8b053..3ffdc6356 100644 --- a/neuralmonkey/decoders/autoregressive.py +++ b/neuralmonkey/decoders/autoregressive.py @@ -479,14 +479,16 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict: @tensor def temporal_states(self) -> tf.Tensor: + # strip the last symbol which is return tf.cond( self.train_mode, - lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-2], + lambda: tf.transpose(self.train_output_states, [1, 0, 2])[:, :-1], lambda: tf.transpose( - self.runtime_output_states, [1, 0, 2])[:, :-2]) + self.runtime_output_states, [1, 0, 2])[:, :-1]) @tensor def temporal_mask(self) -> 
tf.Tensor:
+        # strip the last symbol which is </s>
         return tf.cond(
             self.train_mode,
             lambda: tf.transpose(self.train_mask, [1, 0])[:, :-1],
             lambda: tf.to_float(tf.transpose(
                 self.runtime_mask, [1, 0])[:, :-1]))
diff --git a/neuralmonkey/decoders/sequence_labeler.py b/neuralmonkey/decoders/sequence_labeler.py
index 510a7ca28..f0a9b7974 100644
--- a/neuralmonkey/decoders/sequence_labeler.py
+++ b/neuralmonkey/decoders/sequence_labeler.py
@@ -17,7 +17,7 @@ class SequenceLabeler(ModelPart):
 
     Note that when the labeler is stacked on an autoregressive decoder, it
     labels the symbol that is currently generated by the decoder, i.e., the
-    decoder's state has not yet been updated by putting the decoded symbol on
+    decoder state has not yet been updated by putting the decoded symbol on
     its input.
     """