"""
MIT License

Copyright (c) 2022 Author(s)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import math
import tensorflow as tf
import tensorflow_addons as tfa 
import numpy as np

class MyScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning rate schedule that yields the same value at every step."""

    def __init__(self, learning_rate):
        """
        # Args
        learning_rate: The value returned by every call of the schedule.
        """
        super(MyScheduler, self).__init__()
        self.learning_rate = learning_rate

    def __call__(self, step):
        """Return the stored learning rate; `step` is ignored."""
        return self.learning_rate

    def get_config(self):
        """Return the constructor arguments so the schedule is serializable.

        Keras calls this when an optimizer holding the schedule is
        serialized; the base class does not provide an implementation.
        """
        return {"learning_rate": self.learning_rate}


# Ref: https://github.com/keras-team/keras/blob/v2.6.0/keras/optimizer_v2/learning_rate_schedule.py#L549-L638
class CosineDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """A LearningRateSchedule that uses a cosine decay schedule.
  See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
  SGDR: Stochastic Gradient Descent with Warm Restarts.
  When training a model, it is often useful to lower the learning rate as
  the training progresses. This schedule applies a cosine decay function
  to an optimizer step, given a provided initial learning rate.
  It requires a `step` value to compute the decayed learning rate. You can
  just pass a TensorFlow variable that you increment at each training step.
  The schedule is a 1-arg callable that produces a decayed learning
  rate when passed the current optimizer step. This can be useful for changing
  the learning rate value across different invocations of optimizer functions.
  It is computed as:
  ```python
  def decayed_learning_rate(step):
    step = min(step, decay_steps)
    cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
    decayed = (1 - alpha) * cosine_decay + alpha
    return initial_learning_rate * decayed
  ```
  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed_fn = CosineDecay(initial_learning_rate, decay_steps)
  ```
  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
  as the learning rate. `get_config` returns the constructor arguments so
  the schedule can be serialized together with the optimizer.
  Returns:
    A 1-arg callable learning rate schedule that takes the current optimizer
    step and outputs the decayed learning rate, a scalar `Tensor` of the same
    type as `initial_learning_rate`.
  """

  def __init__(
      self,
      initial_learning_rate,
      decay_steps,
      alpha=0.0,
      name=None):
    """Applies cosine decay to the learning rate.
    Args:
      initial_learning_rate: A scalar `float32` or `float64` Tensor or a
        Python number. The initial learning rate.
      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
        Number of steps to decay over.
      alpha: A scalar `float32` or `float64` Tensor or a Python number.
        Minimum learning rate value as a fraction of initial_learning_rate.
      name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
    """
    super(CosineDecay, self).__init__()

    self.initial_learning_rate = initial_learning_rate
    self.decay_steps = decay_steps
    self.alpha = alpha
    self.name = name

  def __call__(self, step):
    """Return the decayed learning rate for optimizer step `step`."""
    with tf.name_scope(self.name or "CosineDecay"):
      initial_learning_rate = tf.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      # Every operand is brought to the dtype of the initial learning rate,
      # otherwise mixing float32 and float64 tensors raises a dtype error.
      dtype = initial_learning_rate.dtype
      decay_steps = tf.cast(self.decay_steps, dtype)
      alpha = tf.cast(self.alpha, dtype)
      pi = tf.constant(math.pi, dtype=dtype)

      # Clamp the step so the rate stays at its minimum after decay_steps.
      global_step_recomp = tf.cast(step, dtype)
      global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
      completed_fraction = global_step_recomp / decay_steps
      cosine_decayed = 0.5 * (1.0 + tf.cos(pi * completed_fraction))

      decayed = (1 - alpha) * cosine_decayed + alpha
      return tf.multiply(initial_learning_rate, decayed)

  def get_config(self):
    """Return the constructor arguments as a dict (used for serialization)."""
    return {
        "initial_learning_rate": self.initial_learning_rate,
        "decay_steps": self.decay_steps,
        "alpha": self.alpha,
        "name": self.name
    }


class CosineDecayLinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """A LearningRateSchedule that uses a cosine decay schedule w/ linear warmup.
  See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
  SGDR: Stochastic Gradient Descent with Warm Restarts.
  When training a model, it is often useful to lower the learning rate as
  the training progresses. This schedule applies a linear warmup before
  a cosine decay function to an optimizer step.
  It requires a `step` value to compute the decayed learning rate. You can
  just pass a TensorFlow variable that you increment at each training step.
  The schedule is a 1-arg callable that produces a learning
  rate when passed the current optimizer step. This can be useful for changing
  the learning rate value across different invocations of optimizer functions.
  The learning rate is computed as:
  ```python
  def learning_rate(step):
    if step < warmup_steps:
      fraction = step / warmup_steps
      return initial_learning_rate + fraction * (
          max_learning_rate - initial_learning_rate)
    step = min(step, decay_steps) - warmup_steps
    cosine_decay = 0.5 * (1 + cos(pi * step / (decay_steps - warmup_steps)))
    decayed = (1 - alpha) * cosine_decay + alpha
    return max_learning_rate * decayed
  ```
  You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
  as the learning rate. `get_config` returns the constructor arguments so
  the schedule can be serialized together with the optimizer.
  Returns:
    A 1-arg callable learning rate schedule that takes the current optimizer
    step and outputs the learning rate, a `Tensor` of the same
    type as `initial_learning_rate`.
  """

  def __init__(
      self,
      initial_learning_rate,
      max_learning_rate,
      warmup_steps,
      decay_steps,
      alpha=0.0,
      name=None):
    """Applies a linear warmup followed by cosine decay to the learning rate.
    Args:
      initial_learning_rate: A scalar `float32` or `float64` Tensor or a
        Python number. The initial learning rate.
      max_learning_rate: A scalar `float32` or `float64` Tensor or a
        Python number. The learning rate at the end of the linear warmup.
      warmup_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
        Number of steps of the linear phase.
      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
        Step, counted from step 0 (warmup included), at which the cosine
        phase reaches its minimum. Must be greater than warmup_steps.
      alpha: A scalar `float32` or `float64` Tensor or a Python number.
        Minimum learning rate value as a fraction of max_learning_rate.
      name: String. Optional name of the operation.
        Defaults to 'CosineDecayLinearWarmup'.
    Raises:
      AssertionError: if warmup_steps is not smaller than decay_steps.
    """
    super(CosineDecayLinearWarmup, self).__init__()
    # Left as an assert so callers keep seeing the same exception type.
    assert warmup_steps < decay_steps, (
        "warmup_steps must be smaller than decay_steps.")

    self.initial_learning_rate = initial_learning_rate
    self.max_learning_rate = max_learning_rate
    self.warmup_steps = warmup_steps
    self.decay_steps = decay_steps
    self.alpha = alpha
    self.name = name

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    with tf.name_scope(self.name or "CosineDecayLinearWarmup"):
      initial_learning_rate = tf.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      # Every operand, including the step counter, is brought to the dtype
      # of the initial learning rate: comparing or dividing an integer step
      # with float tensors raises a dtype error.
      dtype = initial_learning_rate.dtype
      max_learning_rate = tf.cast(self.max_learning_rate, dtype)
      warmup_steps = tf.cast(self.warmup_steps, dtype)
      decay_steps = tf.cast(self.decay_steps, dtype)
      alpha = tf.cast(self.alpha, dtype)
      pi = tf.constant(math.pi, dtype=dtype)
      global_step_recomp = tf.cast(step, dtype)

      # Linear warmup from initial_learning_rate to max_learning_rate.
      warmup_fraction = global_step_recomp / warmup_steps
      warmup_learning_rate = warmup_fraction * (
          max_learning_rate - initial_learning_rate) + initial_learning_rate

      # Cosine decay from max_learning_rate, measured from the end of warmup.
      cosine_steps = decay_steps - warmup_steps
      steps_after_warmup = tf.minimum(
          global_step_recomp - warmup_steps, cosine_steps)
      completed_fraction = steps_after_warmup / cosine_steps
      cosine_decayed = 0.5 * (1.0 + tf.cos(pi * completed_fraction))
      decayed = (1 - alpha) * cosine_decayed + alpha
      cosine_learning_rate = tf.multiply(max_learning_rate, decayed)

      # tf.where instead of a Python `if`: branching on a tensor in Python
      # only works eagerly or under AutoGraph. Both phases yield
      # max_learning_rate at step == warmup_steps (up to rounding), so the
      # strict comparison changes nothing there, and with warmup_steps == 0
      # the 0/0 warmup value is never selected.
      return tf.where(
          tf.less(global_step_recomp, warmup_steps),
          warmup_learning_rate,
          cosine_learning_rate)

  def get_config(self):
    """Return the constructor arguments as a dict (used for serialization)."""
    return {
        "initial_learning_rate": self.initial_learning_rate,
        "max_learning_rate": self.max_learning_rate,
        "warmup_steps": self.warmup_steps,
        "decay_steps": self.decay_steps,
        "alpha": self.alpha,
        "name": self.name
    }


class OptimizerManager():
    """
    # References
    Module: tf.keras.optimizers.schedules
    https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/schedules
    Module: tf.keras.optimizers
    https://www.tensorflow.org/api_docs/python/tf/keras/optimizers
    Module: tfa.optimizers
    https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers
    Most of the following docstrings are based on them.
    # How to implement your own schedule object
    Ref (accessed July 11, 2021): https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
    To implement your own schedule object, you should implement the `__call__`
    method, which takes a `step` argument (scalar integer tensor, the
    current training step count).
    Like for any other Keras object, you can also optionally
    make your object serializable by implementing the `get_config`
    and `from_config` methods.
    Example:
    class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, initial_learning_rate):
            self.initial_learning_rate = initial_learning_rate
        def __call__(self, step):
            return self.initial_learning_rate / (step + 1)
    optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1))
      """
    def __init__(self, name_scheduler, name_optimizer,
        kwargs_scheduler=None, kwargs_optimizer=None):
        """
        Build the learning rate scheduler and the optimizer selected by name.
        The scheduler is passed to the optimizer as its learning_rate; the
        results are stored in self.scheduler and self.optimizer.
        # Args
        name_scheduler: Str. One of the scheduler names listed below.
        name_optimizer: Str. One of the optimizer names listed below.
        kwargs_scheduler: Dict or None. Should match the arguments for the
            scheduler you will use. None means no arguments.
        kwargs_optimizer: Dict or None. Should match the arguments for the
            optimizer you will use (learning_rate excluded, it is set to
            the scheduler). None means no extra arguments.
        # Raises
        ValueError: if name_optimizer or name_scheduler is not supported.
        """
        # None replaces the former dict() defaults so that no mutable object
        # is shared between calls.
        if kwargs_scheduler is None:
            kwargs_scheduler = {}
        if kwargs_optimizer is None:
            kwargs_optimizer = {}

        # Current support
        dc_available_sc = {
            "Constant": self.ConstantScheduler,
            "PolynomialDecay": self.PolynomialDecayScheduler,
            "PiecewiseConstantDecay": self.PiecewiseConstantDecayScheduler,
            "ExponentialDecay": self.ExponentialDecayScheduler,
            "InverseTimeDecay": self.InverseTimeDecayScheduler,
            "CosineDecay": self.CosineDecayScheduler,
            "CosineDecayLinearWarmup": self.CosineDecayLinearWarmupScheduler,
        }
        dc_available_op = {
            "Adam": self.AdamOptimizer,
            "SGD": self.SGDOptimizer,
            "Adadelta": self.AdadeltaOptimizer,
            "Adagrad": self.AdagradOptimizer,
            "Adamax": self.AdamaxOptimizer,
            "RMSprop": self.RMSpropOptimizer,
            "COCOB": self.COCOBOptimizer,
            "ConditionalGradient": self.ConditionalGradientOptimizer, # buggy
            "LAMB": self.LAMBOptimizer,
            "LazyAdam": self.LazyAdamOptimizer,
            "NovoGrad": self.NovoGradOptimizer,
            "ProximalAdagrad": self.ProximalAdagradOptimizer, # buggy
            "RectifiedAdam": self.RectifiedAdamOptimizer,
            "Yogi": self.YogiOptimizer,
            #"Nadam": self.NadamOptimizer, #LearningRateSchedules not supported
        }

        # Validate the names (optimizer first, then scheduler)
        if name_optimizer not in dc_available_op:
            raise ValueError(
                "Wrong name_optimizer. It should be in {}. Given {}.".format(
                    list(dc_available_op), name_optimizer))
        if name_scheduler not in dc_available_sc:
            raise ValueError(
                "Wrong name_scheduler. It should be in {}: Given {}.".format(
                    list(dc_available_sc), name_scheduler))

        # Define scheduler
        self.scheduler = dc_available_sc[name_scheduler](**kwargs_scheduler)

        # Define optimizer
        self.optimizer = dc_available_op[name_optimizer](
            learning_rate=self.scheduler, **kwargs_optimizer)

    def __call__(self):
        return self.optimizer

    # Schedulers
    def ConstantScheduler(self, learning_rate):
        """
        Build a schedule that yields one fixed learning rate.
        # Args
        learning_rate: Float. Handed unchanged to MyScheduler.
        # Returns
        The MyScheduler instance wrapping learning_rate.
        """
        return MyScheduler(learning_rate)


    def PolynomialDecayScheduler(
            self,
            initial_learning_rate,
            decay_steps,
            end_learning_rate=0.0001,
            power=1.0,
            cycle=False,
            name=None):
        """
        Build a tf.keras.optimizers.schedules.PolynomialDecay schedule.
        # Definition
        With cycle=False:
            step = min(step, decay_steps)
            lr = (initial_learning_rate - end_learning_rate)
                 * (1 - step / decay_steps) ** power + end_learning_rate
        With cycle=True, decay_steps is first replaced by the smallest
        multiple of decay_steps that is not below step:
            decay_steps = decay_steps * ceil(step / decay_steps)
        and the same formula is applied without clamping step.
        # Args
        initial_learning_rate: A scalar float32 or float64 Tensor or a
            Python number. Learning rate at step 0.
        decay_steps: A scalar int32 or int64 Tensor or a Python number.
            Must be positive.
        end_learning_rate: A scalar float32 or float64 Tensor or a Python
            number. Learning rate reached at the end of the decay.
        power: A scalar float32 or float64 Tensor or a Python number.
            Exponent of the polynomial; 1.0 gives a linear decay.
        cycle: Boolean. Whether the schedule cycles beyond decay_steps.
        name: String. Optional name of the operation.
            Defaults to 'PolynomialDecay'.
        # Returns
        The PolynomialDecay schedule.
        """
        schedule_cls = tf.keras.optimizers.schedules.PolynomialDecay
        return schedule_cls(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            end_learning_rate=end_learning_rate,
            power=power,
            cycle=cycle,
            name=name)

    def PiecewiseConstantDecayScheduler(self, boundaries, values, name=None):
        """
        Build a tf.keras.optimizers.schedules.PiecewiseConstantDecay schedule.
        # Definition
        The learning rate is constant on each interval delimited by
        boundaries; values lists the rate of each interval in order.
        # Args
        boundaries: A list of Tensors or ints or floats with strictly
            increasing entries, all of the same type as the optimizer step.
        values: A list of Tensors or floats or ints giving the learning
            rate of each interval. It should have one more element than
            boundaries, and all elements should have the same type.
        name: A string. Optional name of the operation.
            Defaults to 'PiecewiseConstant'.
        # Returns
        The PiecewiseConstantDecay schedule.
        """
        schedule_cls = tf.keras.optimizers.schedules.PiecewiseConstantDecay
        return schedule_cls(
            boundaries=boundaries,
            values=values,
            name=name)

    def ExponentialDecayScheduler(
            self,
            initial_learning_rate,
            decay_steps,
            decay_rate,
            staircase=False,
            name=None):
        """
        Build a tf.keras.optimizers.schedules.ExponentialDecay schedule.
        # Definition
            lr = initial_learning_rate * decay_rate ** (step / decay_steps)
        With staircase=True, step / decay_steps is an integer division, so
        the learning rate follows a staircase function.
        # Remark
        decay_rate = 1.0 keeps the learning rate constant.
        decay_rate < 1.0 makes it decrease towards 0.
        decay_rate > 1.0 makes it grow without bound.
        # Args
        initial_learning_rate: A scalar float32 or float64 Tensor
            or a Python number. Learning rate at step 0.
        decay_steps: A scalar int32 or int64 Tensor or a Python number.
            Must be positive.
        decay_rate: A scalar float32 or float64 Tensor or
            a Python number. The decay rate.
        staircase: Boolean. If True, decay at discrete intervals.
        name: String. Optional name of the operation.
            Defaults to 'ExponentialDecay'.
        # Returns
        The ExponentialDecay schedule.
        """
        schedule_cls = tf.keras.optimizers.schedules.ExponentialDecay
        return schedule_cls(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=staircase,
            name=name)

    def InverseTimeDecayScheduler(initial_learning_rate, decay_steps,
        decay_rate, staircase=False, name=None):
        """
        # Remark
        If decay_rate = 0.0, the learning rate will be constant.
        If decay_rate > 0.0, the learning rate will decrease and
            will converge to 0.0.
        If decay_rate < 0.0, the learning rate will increase and
            will be infinite at step = decay_step.
        # Definition
        def decayed_learning_rate(step):
            return initial_learning_rate/(1+decay_rate*step/decay_step)
        or, if staircase is True, as:
        def decayed_learning_rate(step):
            return initial_learning_rate/(1+decay_rate*floor(step/decay_step))
        # Arg
        initial_learning_rate: A scalar float32 or float64 Tensor 
            or a Python number. The initial learning rate.
        decay_steps: How often to apply decay.
        decay_rate: A Python number. The decay rate.
        staircase: Whether to apply decay in a discrete staircase, 
            as opposed to continuous, fashion.
        name: String. Optional name of the operation. Defaults to 
            'InverseTimeDecay'.
        """
        if decay_rate < 0:
            assert decay_steps
        scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
            initial_learning_rate, decay_steps, decay_rate, 
            staircase=False, name=None)
        return scheduler

    def CosineDecayScheduler(
            self,
            initial_learning_rate,
            decay_steps,
            alpha=0.0,
            name=None):
        """
        Build a CosineDecay schedule.
        # Definition
            step = min(step, decay_steps)
            cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
            decayed = (1 - alpha) * cosine_decay + alpha
            lr = initial_learning_rate * decayed
        # Remark
        Once decay_steps steps have passed, the learning rate stays at
        initial_learning_rate * alpha (0.0 with the default alpha).
        # Args
        initial_learning_rate: A scalar float32 or float64 Tensor or
            a Python number. Learning rate at step 0.
        decay_steps: A scalar int32 or int64 Tensor or a Python number.
            Number of steps to decay over.
        alpha: A scalar float32 or float64 Tensor or a Python number.
            Minimum learning rate as a fraction of initial_learning_rate.
        name: String. Optional name of the operation.
            Defaults to 'CosineDecay'.
        # Returns
        The CosineDecay schedule.
        """
        return CosineDecay(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            alpha=alpha,
            name=name)

    def CosineDecayLinearWarmupScheduler(
            initial_learning_rate,
            max_learning_rate,
            warmup_steps,
            decay_steps,
            alpha=0.0,
            name=None):
        """Applies cosine decay with linear warmup to the learning rate.
        Args:
        initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
        max_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The learning rate at the end of the linear warmup.
        warmup_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Number of steps of the linear phase.
        decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Number of steps of the cosine phase to decay over.
            Should be greater than warmup_steps.
        alpha: A scalar `float32` or `float64` Tensor or a Python number.
            Minimum learning rate value as a fraction of max_learning_rate.
        name: String. Optional name of the operation.
            Defaults to 'CosineDecayLinearWarmup'.
        """        
        scheduler = CosineDecayLinearWarmup(
            initial_learning_rate=initial_learning_rate,
            max_learning_rate=max_learning_rate,
            warmup_steps=warmup_steps,
            decay_steps=decay_steps,
            alpha=alpha,
            name=name)
        return scheduler

    # Optimizers
    def SGDOptimizer(
            self,
            learning_rate,
            momentum=0.0,
            nesterov=False,
            name='SGD',
            **kwargs):
        """
        Build a tf.keras.optimizers.SGD optimizer.
        # Definition
        Update of parameter w with gradient g when momentum is 0:
            w = w - learning_rate * g
        When momentum is larger than 0:
            velocity = momentum * velocity - learning_rate * g
            w = w + velocity
        When nesterov=True, the update becomes:
            velocity = momentum * velocity - learning_rate * g
            w = w + momentum * velocity - learning_rate * g
        # Args
        learning_rate: A Tensor, floating point value, a
            tf.keras.optimizers.schedules.LearningRateSchedule, or a
            callable that takes no arguments and returns the value to use.
        momentum: Float hyperparameter >= 0 that accelerates gradient
            descent in the relevant direction and dampens oscillations.
            0 gives vanilla gradient descent.
        nesterov: Boolean. Whether to apply Nesterov momentum.
        name: Optional name prefix for the operations created when
            applying gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The SGD optimizer.
        """
        return tf.keras.optimizers.SGD(
            learning_rate=learning_rate,
            momentum=momentum,
            nesterov=nesterov,
            name=name,
            **kwargs)

    def AdamOptimizer(
            self,
            learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            amsgrad=False,
            name='Adam',
            **kwargs):
        """
        Build a tf.keras.optimizers.Adam optimizer.
        # Args
        learning_rate: A Tensor, floating point value, a
            tf.keras.optimizers.schedules.LearningRateSchedule, or a
            callable that takes no arguments and returns the value to use.
        beta_1: A float value, a constant float tensor, or a callable that
            takes no arguments and returns the value to use. Exponential
            decay rate for the 1st moment estimates.
        beta_2: Same kinds of value as beta_1. Exponential decay rate for
            the 2nd moment estimates.
        epsilon: A small constant for numerical stability. It is the
            "epsilon hat" of the Kingma and Ba paper (formula just before
            Section 2.1), not the epsilon of Algorithm 1. The value 1e-7
            might not be a good choice in general; e.g. 1.0 or 0.1 has been
            reported as a good choice for Inception on ImageNet.
        amsgrad: Boolean. Whether to apply the AMSGrad variant from the
            paper "On the Convergence of Adam and beyond".
        name: Optional name for the operations created when applying
            gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The Adam optimizer.
        """
        return tf.keras.optimizers.Adam(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            amsgrad=amsgrad,
            name=name,
            **kwargs)

    def AdadeltaOptimizer(
            self,
            learning_rate,
            rho=0.95,
            epsilon=1e-07,
            name='Adadelta',
            **kwargs):
        """
        Build a tf.keras.optimizers.Adadelta optimizer.
        # Args
        learning_rate: A Tensor, floating point value, or a
            tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        rho: Passed to the optimizer as its `rho` argument (the decay
            rate of Adadelta).
        epsilon: A small constant for numerical stability.
        name: Optional name for the operations created when applying
            gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The Adadelta optimizer.
        """
        return tf.keras.optimizers.Adadelta(
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            name=name,
            **kwargs)

    def AdagradOptimizer(
            self,
            learning_rate,
            initial_accumulator_value=0.1,
            epsilon=1e-07,
            name='Adagrad',
            **kwargs):
        """
        Build a tf.keras.optimizers.Adagrad optimizer.
        # Args
        learning_rate: A Tensor, floating point value, or a
            tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        initial_accumulator_value: A floating point value. Starting value
            of the accumulators; must be non-negative.
        epsilon: A small floating point value to avoid a zero denominator.
        name: Optional name prefix for the operations created when
            applying gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The Adagrad optimizer.
        """
        return tf.keras.optimizers.Adagrad(
            learning_rate=learning_rate,
            initial_accumulator_value=initial_accumulator_value,
            epsilon=epsilon,
            name=name,
            **kwargs)

    def AdamaxOptimizer(
            self,
            learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            name='Adamax',
            **kwargs):
        """
        Build a tf.keras.optimizers.Adamax optimizer.
        # Args
        learning_rate: A Tensor, floating point value, or a
            tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta_1: A float value or a constant float tensor. Exponential
            decay rate for the 1st moment estimates.
        beta_2: A float value or a constant float tensor. Exponential
            decay rate for the exponentially weighted infinity norm.
        epsilon: A small constant for numerical stability.
        name: Optional name for the operations created when applying
            gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The Adamax optimizer.
        """
        return tf.keras.optimizers.Adamax(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            name=name,
            **kwargs)

    # The Nadam optimizer does not support 
    # tf.keras.optimizers.LearningRateSchedules as the learning rate.
    def NadamOptimizer(
            self,
            learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            name='Nadam',
            **kwargs):
        """
        Build a tf.keras.optimizers.Nadam optimizer.
        # Definition
        Adam + Nesterov momentum
        # Args
        learning_rate: A Tensor or a floating point value.
            The learning rate.
        beta_1: A float value or a constant float tensor. Passed to the
            optimizer as `beta_1` (decay rate for the 1st moment
            estimates).
        beta_2: A float value or a constant float tensor. Passed to the
            optimizer as `beta_2`.
        epsilon: A small constant for numerical stability.
        name: Optional name for the operations created when applying
            gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The Nadam optimizer.
        """
        return tf.keras.optimizers.Nadam(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            name=name,
            **kwargs)

    def RMSpropOptimizer(
            self,
            learning_rate,
            rho=0.9,
            momentum=0.0,
            epsilon=1e-07,
            centered=False,
            name='RMSprop',
            **kwargs):
        """
        Build a tf.keras.optimizers.RMSprop optimizer.
        # Args
        learning_rate: A Tensor, floating point value, a
            tf.keras.optimizers.schedules.LearningRateSchedule, or a
            callable that takes no arguments and returns the value to use.
        rho: Discounting factor for the history/coming gradient.
        momentum: A scalar or a scalar Tensor.
        epsilon: A small constant for numerical stability.
        centered: Boolean. If True, gradients are normalized by the
            estimated variance of the gradient; if False, by the
            uncentered second moment. True may help with training but
            costs slightly more computation and memory.
        name: Optional name prefix for the operations created when
            applying gradients.
        **kwargs: Forwarded to the optimizer, e.g. "clipnorm" (float,
            clips gradients by norm) or "clipvalue" (float, clips
            gradients by value).
        # Returns
        The RMSprop optimizer.
        """
        return tf.keras.optimizers.RMSprop(
            learning_rate=learning_rate,
            rho=rho,
            momentum=momentum,
            epsilon=epsilon,
            centered=centered,
            name=name,
            **kwargs)

    def COCOBOptimizer(
        self,
        learning_rate=None,
        alpha=100.,
        use_locking=False,
        name='COCOB',
        **kwargs):
        """
        # Definition
        Builds a tfa.optimizers.COCOB optimizer.
        See https://arxiv.org/abs/1705.07795
        # Args
        learning_rate: Dummy arg. Not used; it is accepted only so that the
            signature lines up with the other optimizer factories and is
            never forwarded to tfa.optimizers.COCOB.
        alpha: Default value is set to 100 as per paper
            (https://arxiv.org/abs/1705.07795). This has the effect of
            restricting the value of the parameters in the first iterations
            of the algorithm.
        use_locking: Forwarded unchanged to tfa.optimizers.COCOB.
            Defaults to False.
        name: Forwarded unchanged to tfa.optimizers.COCOB.
            Defaults to "COCOB".
        **kwargs: Extra keyword arguments, forwarded unchanged to
            tfa.optimizers.COCOB.
        # Returns
        The newly constructed tfa.optimizers.COCOB instance.
        """
        # learning_rate is intentionally left out of the call below.
        return tfa.optimizers.COCOB(
            alpha=alpha, use_locking=use_locking, name=name, **kwargs)

    def ConditionalGradientOptimizer(
        self,
        learning_rate,
        lambda_=0.01,
        epsilon=1e-07,
        ord='fro',
        name='ConditionalGradient',
        **kwargs):
        """
        # Definition
        This optimizer helps handle constraints well.
        Currently only supports frobenius norm constraint or nuclear norm
        constraint. See https://arxiv.org/pdf/1803.06453.pdf
        For the frobenius norm constraint, the update is:
        variable -= (1-learning_rate) * (variable + lambda_ * gradient
            / (frobenius_norm(gradient) + epsilon))
        Note that lambda_ here refers to the constraint "lambda" in the paper.
        epsilon is a constant with a tiny value compared to the frobenius
        norm of the gradient. Its purpose is to avoid the case where the
        frobenius norm of the gradient is 0.
        In this implementation, epsilon defaults to 1e-07.
        For the nuclear norm constraint, the formula is as follows:
        variable -= (1-learning_rate) * (variable
            + lambda_ * top_singular_vector(gradient))
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
        lambda_: A Tensor or a floating point value. The constraint.
        epsilon: A Tensor or a floating point value. A small constant for
            numerical stability when handling the case of norm of gradient
            to be zero.
        ord: Order of the norm. Supported values are 'fro' and 'nuclear'.
            Default is 'fro', which is frobenius norm.
        name: Optional name prefix for the operations created when applying
            gradients. Defaults to 'ConditionalGradient'.
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility to
            allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.ConditionalGradient instance.
        """
        return tfa.optimizers.ConditionalGradient(
            learning_rate=learning_rate,
            lambda_=lambda_,
            epsilon=epsilon,
            ord=ord,
            name=name,
            **kwargs)

    def LAMBOptimizer(
        self,
        learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-06,
        weight_decay_rate=0.0,
        exclude_from_weight_decay=None,
        exclude_from_layer_adaptation=None,
        name='LAMB',
        **kwargs):
        """
        # Definition
        Optimizer that implements the Layer-wise Adaptive Moments (LAMB).
        See paper https://arxiv.org/abs/1904.00962
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta_1: A float value or a constant float tensor. The exponential decay
            rate for the 1st moment estimates.
        beta_2: A float value or a constant float tensor. The exponential decay
            rate for the 2nd moment estimates.
        epsilon: A small constant for numerical stability.
        weight_decay_rate: Weight decay rate.
        exclude_from_weight_decay: List of regex patterns of variables excluded
            from weight decay. Variables whose name contains a substring
            matching the pattern will be excluded.
        exclude_from_layer_adaptation: List of regex patterns of variables
            excluded from layer adaptation. Variables whose name contains a
            substring matching the pattern will be excluded.
        name: Optional name for the operations created when applying gradients.
            Defaults to "LAMB".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.LAMB instance.
        """
        return tfa.optimizers.LAMB(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            weight_decay_rate=weight_decay_rate,
            exclude_from_weight_decay=exclude_from_weight_decay,
            exclude_from_layer_adaptation=exclude_from_layer_adaptation,
            name=name,
            **kwargs)

    def LazyAdamOptimizer(
        self,
        learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name='LazyAdam',
        **kwargs):
        """
        # Definition
        Variant of the Adam optimizer that handles sparse updates more
        efficiently. The original Adam algorithm maintains two moving-average
        accumulators for each trainable variable; the accumulators are updated
        at every step. This class provides lazier handling of gradient updates
        for sparse variables. It only updates moving-average accumulators for
        sparse variable indices that appear in the current batch, rather than
        updating the accumulators for all indices. Compared with the original
        Adam optimizer, it can provide large improvements in model training
        throughput for some applications. However, it provides slightly
        different semantics than the original Adam algorithm, and may lead
        to different empirical results. Note, amsgrad is currently not
        supported and the argument can only be False.
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta_1: A float value or a constant float tensor. The exponential
            decay rate for the 1st moment estimates.
        beta_2: A float value or a constant float tensor. The exponential
            decay rate for the 2nd moment estimates.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in Adam: A Method for Stochastic Optimization.
            Kingma et al., 2014 (in the formula just before Section 2.1),
            not the epsilon in Algorithm 1 of the paper.
        amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond". Note that
            this argument is currently not supported and the argument can
            only be False.
        name: Optional name for the operations created when applying
            gradients. Defaults to "LazyAdam".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.LazyAdam instance.
        """
        return tfa.optimizers.LazyAdam(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            amsgrad=amsgrad,
            name=name,
            **kwargs)

    def NovoGradOptimizer(
        self,
        learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        weight_decay=0.0,
        grad_averaging=False,
        amsgrad=False,
        name='NovoGrad',
        **kwargs):
        """
        # Definition
        The NovoGrad Optimizer was first proposed in Stochastic Gradient
        Methods with Layerwise Adaptive Moments for training of Deep Networks
        (https://arxiv.org/pdf/1905.11286.pdf). NovoGrad is a first-order
        SGD-based algorithm, which computes second moments per layer instead of
        per weight as in Adam. Compared to Adam, NovoGrad takes less memory,
        and has been found to be more numerically stable. (For more information
        on the computation please refer to
        https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd)
        Second order moment = exponential moving average of layer-wise square
        of grads:
            v_t <- beta_2 * v_{t-1} + (1 - beta_2) * (g_t)^2
        First order moment in one of four modes:
        1. moment of grads normalized by v_t:
            m_t <- beta_1 * m_{t-1} + [g_t / (sqrt(v_t) + epsilon)]
        2. moment similar to Adam: exponential moving average of grads
        normalized by v_t (set grad_averaging = True to use this):
            m_t <- beta_1 * m_{t-1} +
                [(1 - beta_1) * (g_t / (sqrt(v_t) + epsilon))]
        3. weight decay adds a w_d term after grads are rescaled by
        1/sqrt(v_t) (set weight_decay > 0 to use this):
            m_t <- beta_1 * m_{t-1} +
                [(g_t / (sqrt(v_t) + epsilon)) + (w_d * w_{t-1})]
        4. weight decay + exponential moving average from Adam:
            m_t <- beta_1 * m_{t-1} +
                [(1 - beta_1) * ((g_t / (sqrt(v_t) + epsilon)) +
                (w_d * w_{t-1}))]
        Weight update: w_t <- w_{t-1} - lr_t * m_t
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta_1: A float value or a constant float tensor. The exponential decay
            rate for the 1st moment estimates.
        beta_2: A float value or a constant float tensor. The exponential decay
            rate for the 2nd moment estimates.
        epsilon: A small constant for numerical stability.
        weight_decay: A floating point value. Weight decay for each param.
        grad_averaging: determines whether to use Adam style exponential moving
            averaging for the first order moments.
        amsgrad: Forwarded unchanged to tfa.optimizers.NovoGrad.
            Defaults to False.
        name: Forwarded unchanged to tfa.optimizers.NovoGrad.
            Defaults to "NovoGrad".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.NovoGrad instance.
        """
        return tfa.optimizers.NovoGrad(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            weight_decay=weight_decay,
            grad_averaging=grad_averaging,
            amsgrad=amsgrad,
            name=name,
            **kwargs)

    def ProximalAdagradOptimizer(
        self,
        learning_rate,
        initial_accumulator_value=0.1,
        l1_regularization_strength=0.0,
        l2_regularization_strength=0.0,
        name='ProximalAdagrad',
        **kwargs):
        """
        # Definition
        Builds a tfa.optimizers.ProximalAdagrad optimizer. See
        Efficient Learning using Forward-Backward Splitting
        (https://proceedings.neurips.cc/paper/2009/file/621bf66ddb7c962aa0d22ac97d69b793-Paper.pdf).
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        initial_accumulator_value: A floating point value. Starting value for
            the accumulators, must be positive.
        l1_regularization_strength: A floating point value. The l1
            regularization term, must be greater than or equal to zero.
        l2_regularization_strength: A floating point value. The l2
            regularization term, must be greater than or equal to zero.
        name: Optional name for the operations created when applying gradients.
            Defaults to "ProximalAdagrad".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.ProximalAdagrad instance.
        """
        return tfa.optimizers.ProximalAdagrad(
            learning_rate=learning_rate,
            initial_accumulator_value=initial_accumulator_value,
            l1_regularization_strength=l1_regularization_strength,
            l2_regularization_strength=l2_regularization_strength,
            name=name,
            **kwargs)

    def RectifiedAdamOptimizer(
        self,
        learning_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        weight_decay=0.0,
        amsgrad=False,
        sma_threshold=5.0,
        total_steps=0,
        warmup_proportion=0.1,
        min_lr=0.0,
        name='RectifiedAdam',
        **kwargs):
        """
        # Definition
        Variant of the Adam optimizer whose adaptive learning rate is
        rectified so as to have a consistent variance.
        It implements the Rectified Adam (a.k.a. RAdam) proposed by
        Liyuan Liu et al. in On The Variance Of The Adaptive Learning Rate
        And Beyond (https://arxiv.org/pdf/1908.03265v1.pdf).
        RAdam is not a replacement of the heuristic warmup; the settings
        should be kept if warmup has already been employed and tuned in the
        baseline method. You can enable warmup by setting total_steps
        and warmup_proportion:
        opt = tfa.optimizers.RectifiedAdam(
            lr=1e-3,
            total_steps=10000,
            warmup_proportion=0.1,
            min_lr=1e-5,)
        In the above example, the learning rate will increase linearly from 0
        to lr in 1000 steps, then decrease linearly from lr to min_lr in
        9000 steps.
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta_1: A float value or a constant float tensor. The exponential
            decay rate for the 1st moment estimates.
        beta_2: A float value or a constant float tensor. The exponential
            decay rate for the 2nd moment estimates.
        epsilon: A small constant for numerical stability.
        weight_decay: A Tensor or a floating point value, or a schedule that is
            a tf.keras.optimizers.schedules.LearningRateSchedule.
            Weight decay for each parameter.
        amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond".
        sma_threshold: A float value. The threshold for
            simple mean average.
        total_steps: An integer value. Total number of training steps.
            Enable warmup by setting a positive value.
        warmup_proportion: A floating point value. The proportion of
            increasing steps.
        min_lr: A floating point value. Minimum learning rate after warmup.
        name: Optional name for the operations created when applying gradients.
            Defaults to "RectifiedAdam".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.RectifiedAdam instance.
        """
        return tfa.optimizers.RectifiedAdam(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
            sma_threshold=sma_threshold,
            total_steps=total_steps,
            warmup_proportion=warmup_proportion,
            min_lr=min_lr,
            name=name,
            **kwargs)

    def YogiOptimizer(
        self,
        learning_rate,
        beta1=0.9,
        beta2=0.999,
        epsilon=0.001,
        l1_regularization_strength=0.0,
        l2_regularization_strength=0.0,
        initial_accumulator_value=1e-06,
        activation='sign',
        name='Yogi',
        **kwargs):
        """
        # Definition
        Builds a tfa.optimizers.Yogi optimizer. See Algorithm 2 of
        https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
        # Args
        learning_rate: A Tensor or a floating point value, or a schedule that
            is a tf.keras.optimizers.schedules.LearningRateSchedule.
            The learning rate.
        beta1: A float value or a constant float tensor.
            The exponential decay rate for the 1st moment estimates.
        beta2: A float value or a constant float tensor.
            The exponential decay rate for the 2nd moment estimates.
        epsilon: A constant trading off adaptivity and noise.
        l1_regularization_strength: A float value, must be greater than or
            equal to zero.
        l2_regularization_strength: A float value, must be greater than or
            equal to zero.
        initial_accumulator_value: The starting value for accumulators.
            Only positive values are allowed.
        activation: Use hard sign or soft tanh to determine sign.
        name: Optional name for the operations created when applying
            gradients. Defaults to "Yogi".
        **kwargs: keyword arguments. Allowed to be {clipnorm, clipvalue, lr,
            decay}. clipnorm is clip gradients by norm; clipvalue is clip
            gradients by value, decay is included for backward compatibility
            to allow time inverse decay of learning rate. lr is included for
            backward compatibility, recommended to use learning_rate instead.
        # Returns
        The newly constructed tfa.optimizers.Yogi instance.
        """
        return tfa.optimizers.Yogi(
            learning_rate=learning_rate,
            beta1=beta1,
            beta2=beta2,
            epsilon=epsilon,
            l1_regularization_strength=l1_regularization_strength,
            l2_regularization_strength=l2_regularization_strength,
            initial_accumulator_value=initial_accumulator_value,
            activation=activation,
            name=name,
            **kwargs)

