diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..a63a300
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,6 @@
+try:
+    from mittens.mittens.tf_mittens import Mittens, GloVe
+except ImportError:
+    from mittens.mittens.np_mittens import Mittens, GloVe
+
+__version__ = "0.2.2"
diff --git a/mittens/__init__.py b/mittens/__init__.py
index b4d7b69..54617b4 100644
--- a/mittens/__init__.py
+++ b/mittens/__init__.py
@@ -1,6 +1,12 @@
 try:
-    from mittens.tf_mittens import Mittens, GloVe
+    try:
+        from mittens.tf_mittens import Mittens, GloVe
+    except ImportError:
+        from mittens.mittens.tf_mittens import Mittens, GloVe
 except ImportError:
-    from mittens.np_mittens import Mittens, GloVe
+    try:
+        from mittens.np_mittens import Mittens, GloVe
+    except ImportError:
+        from mittens.mittens.np_mittens import Mittens, GloVe
 
-__version__ = "0.2"
+__version__ = "0.2.2"
diff --git a/mittens/mittens_base.py b/mittens/mittens_base.py
index 815e529..3da9bbb 100644
--- a/mittens/mittens_base.py
+++ b/mittens/mittens_base.py
@@ -1,10 +1,14 @@
 from copy import copy
 import random
 import sys
+from time import time
 
 import numpy as np
 
-from mittens.doc import BASE_DOC, MITTENS_PARAM_DESCRIPTION
+try:
+    from mittens.doc import BASE_DOC, MITTENS_PARAM_DESCRIPTION
+except ImportError:
+    from mittens.mittens.doc import BASE_DOC, MITTENS_PARAM_DESCRIPTION
 
 
 class MittensBase(object):
@@ -31,6 +35,30 @@ def __init__(self, n=100, mittens=0.1, xmax=100, alpha=0.75,
         self.max_iter = max_iter
         self.errors = list()
         self.test_mode = test_mode
+
+    def message(self, obj, timer=None):
+        """Print a status message, optionally timing a start/stop pair.
+
+        Parameters
+        ----------
+        obj : object
+            The message; anything that is not a `str` goes through `str()`.
+        timer : {None, 'start', 'stop'}
+            'start' records the current time; 'stop' appends the seconds
+            elapsed since the last 'start' to the message.
+
+        """
+        text = obj if isinstance(obj, str) else str(obj)
+        if timer == 'start':
+            self._msg_time = time()
+        elif timer == 'stop':
+            # A 'stop' with no earlier 'start' just prints the message.
+            started = getattr(self, '_msg_time', None)
+            if started is not None:
+                elapsed = time() - started
+                if elapsed > 0:
+                    text += ' ({:.1f}s)'.format(elapsed)
+        print("\r" + text, flush=True)
 
     def fit(self,
             X,
@@ -69,14 +97,18 @@ def fit(self,
             embedding of the corresponding element in `vocab`.
 
         """
+        self.message("Fitting mco {}".format(X.shape))
+
         if fixed_initialization is not None:
             assert self.test_mode, \
                 "Fixed initialization parameters can only be provided" \
                 " in test mode. Initialize {} with `test_mode=True`.". \
                     format(self.__class__.split(".")[-1])
+        self.message(" Dimensions check")
         self._check_dimensions(
             X, vocab, initial_embedding_dict
         )
+        self.message(" Initializing weights and log(mco)")
         weights, log_coincidence = self._initialize(X)
         return self._fit(X, weights, log_coincidence,
                          vocab=vocab,
@@ -163,7 +195,7 @@ def _progressbar(self, msg, iter_num):
         if self.display_progress and \
                 (iter_num + 1) % self.display_progress == 0:
             sys.stderr.write('\r')
-            sys.stderr.write("Iteration {}: {}".format(iter_num + 1, msg))
+            sys.stderr.write("Iteration {}: {}\t\t\t".format(iter_num + 1, msg))
             sys.stderr.flush()
 
     def __repr__(self):
diff --git a/mittens/np_mittens.py b/mittens/np_mittens.py
index b21a3a1..174d8db 100644
--- a/mittens/np_mittens.py
+++ b/mittens/np_mittens.py
@@ -16,8 +16,13 @@
 """
 
 import numpy as np
-from mittens.mittens_base import randmatrix, noise
-from mittens.mittens_base import MittensBase, GloVeBase
+try:
+    from mittens.mittens_base import randmatrix, noise
+    from mittens.mittens_base import MittensBase, GloVeBase
+except ImportError:
+    from mittens.mittens.mittens_base import randmatrix, noise
+    from mittens.mittens.mittens_base import MittensBase, GloVeBase
+
 
 
 _FRAMEWORK = "NumPy"
@@ -35,6 +40,12 @@ class Mittens(MittensBase):
         framework=_FRAMEWORK,
         second=_DESC.format(model=MittensBase._MODEL))
 
+    def __init__(self, *args, **kwargs):
+        # Positional arguments are forwarded too, so calls such as
+        # `Mittens(50)` keep working.
+        super().__init__(*args, **kwargs)
+        self.message("NumPy Mittens initialized.")
+
     @property
     def framework(self):
         return _FRAMEWORK
diff --git a/mittens/tf_mittens.py b/mittens/tf_mittens.py
index 6d3cf4f..c513331 100644
--- a/mittens/tf_mittens.py
+++ b/mittens/tf_mittens.py
@@ -17,8 +17,19 @@ except ImportError:
     import tensorflow as tf
 
 
-from mittens.mittens_base import randmatrix, noise
-from mittens.mittens_base import MittensBase, GloVeBase
+from time import time
+
+try:
+    from mittens.mittens_base import randmatrix, noise
+    from mittens.mittens_base import MittensBase, GloVeBase
+except ImportError:
+    from mittens.mittens.mittens_base import randmatrix, noise
+    from mittens.mittens.mittens_base import MittensBase, GloVeBase
+
+
+from collections import deque
+
+__VER__ = '0.2.1.6'
 
 
 _FRAMEWORK = "TensorFlow"
@@ -35,10 +46,145 @@ class Mittens(MittensBase):
     __doc__ = MittensBase.__doc__.format(
         framework=_FRAMEWORK,
         second=_DESC.format(model=MittensBase._MODEL))
 
+    def __init__(self,
+                 *args,
+                 DEBUG=False,
+                 no_feeds=True,
+                 save_folder=None,
+                 save_iters=500,
+                 save_opt_hist=True,
+                 name='mittenstf',
+                 **kwargs):
+        """Store the TensorFlow-specific training and saving options.
+
+        The options below are keyword-only, so positional arguments
+        are still forwarded to the base class unchanged.
+
+        Parameters
+        ----------
+        DEBUG : bool
+            If True, the training op is built from explicit gradients
+            (logged as histograms when `log_dir` is set).
+        no_feeds : bool
+            If True, `weights` and `log_coincidence` are stored in
+            non-trainable graph variables; otherwise they are fed
+            through placeholders on every iteration.
+        save_folder : str or None
+            Folder for saved files, created if missing. None means
+            the current folder.
+        save_iters : int
+            Save the embeddings every `save_iters` iterations;
+            0 or None disables the periodic saves.
+        save_opt_hist : bool
+            If True, each periodic save also writes the loss plot.
+        name : str
+            Prefix of the saved file names.
+
+        """
+        super().__init__(*args, **kwargs)
+        self.DEBUG = DEBUG
+        self.name = name
+        self.save_iters = save_iters
+        self.save_opt_hist = save_opt_hist
+        self.no_feeds = no_feeds
+        self.message("Tensorflow ({}) Mittens v{} initialized with {}".format(
+            tf.__version__,
+            __VER__,
+            'full in-GPU training (no memory feeds)' if self.no_feeds
+            else 'memory feeds'))
+        self.save_folder = ''
+        if save_folder is not None:
+            # `exist_ok` makes this a no-op for an existing folder.
+            os.makedirs(save_folder, exist_ok=True)
+            self.save_folder = save_folder
+            self.message(" Saving in '{}' folder.".format(self.save_folder))
+        else:
+            self.message(' No folder provided. Saving in current folder.')
+        # `n_words` is not available at construction time, so the
+        # embedding-size message is printed from `_fit` instead.
+        self._last_saved_file = None
+
     @property
     def framework(self):
         return _FRAMEWORK
+
+    def save(self, filename):
+        """Save the current embeddings with `np.save`.
+
+        Parameters
+        ----------
+        filename : str
+            File name, joined to `self.save_folder`; the returned path
+            carries the `.npy` extension.
+
+        Returns
+        -------
+        str or None
+            Path of the written file, or None if saving failed.
+
+        """
+        fn = os.path.join(self.save_folder, filename)
+        embeds = self._get_embeds()
+        try:
+            np.save(fn, embeds)
+        except Exception as err:
+            # Best effort: a failed save is reported, not raised, so
+            # it cannot abort a long training run.
+            self.message('')
+            self.message("Could not save '{}': {}".format(fn, err))
+            return None
+        self.message('')
+        self.message(" Embeddings file '{}' saved.".format(fn))
+        return fn + '.npy'
+
+    def _save_status(self, itr):
+        """Save the embeddings, replacing the previous periodic save.
+
+        Parameters
+        ----------
+        itr : int
+            Current iteration; the file is named by its thousands.
+
+        """
+        if self._last_saved_file is not None:
+            try:
+                os.remove(self._last_saved_file)
+            except OSError:
+                self.message('')
+                self.message("Could not remove '{}'".format(
+                    self._last_saved_file))
+        fn = '{}_i{}k'.format(self.name, int(itr / 1000))
+        self._last_saved_file = self.save(fn)
+
+    def _save_optimization_history(self, skip=5):
+        """Plot `self.errors` to `<name>_loss.png` in `self.save_folder`.
+
+        Parameters
+        ----------
+        skip : int
+            Number of initial iterations left out of the plot.
+
+        """
+        try:
+            import matplotlib.pyplot as plt
+        except ImportError:
+            # matplotlib is optional: skip the plot, keep training.
+            self.message('')
+            self.message("matplotlib not available, loss plot skipped.")
+            return
+        plt.style.use('ggplot')
+        plt.figure()
+        ax = plt.gca()
+        ax.plot(np.arange(skip, len(self.errors)), self.errors[skip:])
+        ax.set_title(
+            'Mittens loss history (skipped first {} iters)'.format(skip))
+        ax.set_xlabel('Iterations')
+        ax.set_ylabel('Loss')
+        ax.set_yscale('log')
+        plt.savefig(
+            os.path.join(self.save_folder, '{}_loss.png'.format(self.name)))
+        plt.close()
 
     def _fit(self, X, weights, log_coincidence,
              vocab=None,
@@ -47,17 +193,32 @@ def _fit(self, X, weights, log_coincidence,
         if fixed_initialization is not None:
             raise AttributeError("Tensorflow version of Mittens does "
                                  "not support specifying initializations.")
 
+        self.message("Preparing graph & session:", timer='start')
+
         # Start the session:
+        # Close the session left over from a previous `fit`, if any.
+        if getattr(self, 'sess', None) is not None:
+            self.sess.close()
+            self.sess = None
+        run_config = tf.RunOptions(report_tensor_allocations_upon_oom=True)
         tf.reset_default_graph()
         self.sess = tf.InteractiveSession()
 
         # Build the computation graph.
+        self.message(" Building graph")
         self._build_graph(vocab, initial_embedding_dict)
+        self.message(" Generating d={} embeddings for {} items".format(
+            self.n,
+            self.n_words))
 
         # Optimizer set-up:
-        self.cost = self._get_cost_function()
-        self.optimizer = self._get_optimizer()
+        self.message(" Preparing cost/train function")
+        if self.no_feeds:
+            self.cost = self._get_cost_function(weights, log_coincidence)
+        else:
+            self.cost = self._get_cost_function_with_placeholders()
+        self.optimizer = self._get_train_func()
 
         # Set up logging for Tensorboard
         if self.log_dir:
@@ -67,6 +228,7 @@ def _fit(self, X, weights, log_coincidence,
             log_writer = tf.summary.FileWriter(directory, flush_secs=1)
 
         # Run training
+        self.message(" Initializing variables")
         self.sess.run(tf.global_variables_initializer())
         if self.test_mode:
             self.W_start = self.sess.run(self.W)
@@ -75,28 +237,65 @@ def _fit(self, X, weights, log_coincidence,
             self.bc_start = self.sess.run(self.bc)
 
         merged_logs = tf.summary.merge_all()
+        # The inputs never change between iterations, so the feed is
+        # built once, outside the loop.
+        if self.no_feeds:
+            feed_dict = None
+        else:
+            feed_dict = {
+                self.weights: weights,
+                self.log_coincidence: log_coincidence,
+            }
+        t0 = time()
+        self._last_timings = deque(maxlen=1000)
+        self.message("Done preparation session", timer='stop')
         for i in range(1, self.max_iter+1):
+            t1 = time()
             _, loss, stats = self.sess.run(
                 [self.optimizer, self.cost, merged_logs],
-                feed_dict={
-                    self.weights: weights,
-                    self.log_coincidence: log_coincidence})
+                feed_dict=feed_dict,
+                options=run_config)
 
             # Keep track of losses
             if self.log_dir and i % 10 == 0:
                 log_writer.add_summary(stats)
             self.errors.append(loss)
-
+            # Estimate the remaining time from the mean duration of the
+            # most recent iterations.
+            t2 = time()
+            self._last_timings.append(t2 - t1)
+            t_lap = np.mean(self._last_timings)
+            t_elapsed = t2 - t0
+            t_total = t_lap * self.max_iter
+            t_remain = t_total - t_elapsed
             if loss < self.tol:
                 # Quit early if tolerance is met
                 self._progressbar("stopping with loss < self.tol", i)
                 break
             else:
-                self._progressbar("loss: {}".format(loss), i)
-
+                self._progressbar(
+                    "loss: {}, time: {:.2f} s/itr, remain: {:.2f} hrs "
+                    "(elapsed: {:.2f} hrs out of total {:.2f} hrs)".format(
+                        loss,
+                        t_lap,
+                        t_remain / 3600,
+                        t_elapsed / 3600,
+                        t_total / 3600), i)
+
+            # `save_iters` of 0 or None disables the periodic saves.
+            if self.save_iters and i % self.save_iters == 0:
+                self._save_status(i)
+                if self.save_opt_hist:
+                    self._save_optimization_history()
+
         # Return the sum of the two learned matrices, as recommended
         # in the paper:
-        return self.sess.run(tf.add(self.W, self.C))
+        self.save(self.name + '_embeds')
+        return self._get_embeds()
+
+    def _get_embeds(self):
+        """Evaluate and return the sum of the `W` and `C` matrices."""
+        return self.sess.run(tf.add(self.W, self.C))
 
     def _build_graph(self, vocab, initial_embedding_dict):
         """Builds the computatation graph.
@@ -148,10 +347,39 @@ def _build_graph(self, vocab, initial_embedding_dict):
         self.bc = self._weight_init(self.n_words, 1, 'bc')
 
         self.model = tf.tensordot(self.W, tf.transpose(self.C), axes=1) + \
             tf.tensordot(self.bw, tf.transpose(self.ones), axes=1) + \
             tf.tensordot(self.ones, tf.transpose(self.bc), axes=1)
 
-    def _get_cost_function(self):
+    def _get_cost_function(self, weights, log_coincidence):
+        """Compute the cost of the Mittens objective function.
+
+        If self.mittens = 0, this is the same as the cost of GloVe.
+        `weights` and `log_coincidence` are stored in non-trainable
+        graph variables, so no feed is needed while training.
+        """
+        self.weights = tf.Variable(weights,
+                                   dtype=tf.float32,
+                                   trainable=False)
+        self.log_coincidence = tf.Variable(log_coincidence,
+                                           dtype=tf.float32,
+                                           trainable=False)
+        self.diffs = tf.subtract(self.model, self.log_coincidence)
+        cost = tf.reduce_sum(
+            0.5 * tf.multiply(self.weights, tf.square(self.diffs)))
+        if self.mittens > 0:
+            # Keep `self.mittens` a plain number: overwriting it with a
+            # tensor would break this test on a later `fit`.
+            mittens_weight = tf.constant(self.mittens, tf.float32)
+            cost += mittens_weight * tf.reduce_sum(
+                tf.multiply(
+                    self.has_embedding,
+                    self._tf_squared_euclidean(
+                        tf.add(self.W, self.C),
+                        self.original_embedding)))
+        tf.summary.scalar("cost", cost)
+        return cost
+
+    def _get_cost_function_with_placeholders(self):
         """Compute the cost of the Mittens objective function.
 
         If self.mittens = 0, this is the same as the cost of GloVe.
@@ -180,17 +408,22 @@ def _tf_squared_euclidean(X, Y):
         """
         return tf.reduce_sum(tf.pow(tf.subtract(X, Y), 2), axis=1)
 
-    def _get_optimizer(self):
+    def _get_train_func(self):
         """Uses Adagrad to optimize the GloVe/Mittens objective,
         as specified in the GloVe paper.
         """
         optim = tf.train.AdagradOptimizer(self.learning_rate)
-        gradients = optim.compute_gradients(self.cost)
-        if self.log_dir:
-            for name, (g, v) in zip(['W', 'C', 'bw', 'bc'], gradients):
-                tf.summary.histogram("{}_grad".format(name), g)
-                tf.summary.histogram("{}_vals".format(name), v)
-        return optim.apply_gradients(gradients)
+        if self.DEBUG:
+            # Explicit gradients, so they can be logged as histograms.
+            gradients = optim.compute_gradients(self.cost)
+            if self.log_dir:
+                for name, (g, v) in zip(['W', 'C', 'bw', 'bc'], gradients):
+                    tf.summary.histogram("{}_grad".format(name), g)
+                    tf.summary.histogram("{}_vals".format(name), v)
+            return optim.apply_gradients(gradients)
+        return optim.minimize(
+            self.cost,
+            global_step=tf.train.get_or_create_global_step())
 
     def _weight_init(self, m, n, name):
         """
@@ -212,15 +445,34 @@ class GloVe(Mittens, GloVeBase):
         second=_DESC.format(model=GloVeBase._MODEL))
 
 
-if __name__ == '__main__':
-
-    X = np.array([
-        [10.0,  2.0,  3.0,  4.0],
-        [ 2.0, 10.0,  4.0,  1.0],
-        [ 3.0,  4.0, 10.0,  2.0],
-        [ 4.0,  1.0,  2.0, 10.0]])
+def _make_word_word_matrix(n=50):
+    """Returns a symmetric `n` x `n` matrix of counts built from
+    Zipf-distributed draws."""
+    base = np.random.zipf(2, size=(n, n)) - 1
+    return base + base.T
+
+
+if __name__ == '__main__':
+    SIMPLE_TEST = False
+    USE_FULL_GPU = True
+
+    if SIMPLE_TEST:
+        X = np.array([
+            [10.0,  2.0,  3.0,  4.0],
+            [ 2.0, 10.0,  4.0,  1.0],
+            [ 3.0,  4.0, 10.0,  2.0],
+            [ 4.0,  1.0,  2.0, 10.0]])
+        embed_size = 4
+    else:
+        X = _make_word_word_matrix(13000)
+        embed_size = 128
 
-    glove = GloVe(n=5, max_iter=5000)
+    glove = GloVe(n=embed_size,
+                  save_folder='mittens_models',
+                  save_iters=100,
+                  max_iter=1000,
+                  DEBUG=False,
+                  no_feeds=USE_FULL_GPU)
     G = glove.fit(X)
 
     print("\nLearned vectors:")