diff --git a/agents/algorithms/ppo/ppo.py b/agents/algorithms/ppo/ppo.py index e62ce94..6359b84 100644 --- a/agents/algorithms/ppo/ppo.py +++ b/agents/algorithms/ppo/ppo.py @@ -530,7 +530,7 @@ def _adjust_penalty(self, observ, old_policy_params, length): kl_change > 1.3 * self._config.kl_target, # pylint: disable=g-long-lambda lambda: tf.Print(self._penalty.assign( - self._penalty * 1.5), [0], 'increase penalty '), + self._penalty * 1.5 + 1e-8), [0], 'increase penalty '), float) maybe_decrease = tf.cond( kl_change < 0.7 * self._config.kl_target,