Source code for symjax.nn.optimizers

import numpy

import symjax
from symjax import tensor
from ..base import gradients
from symjax.nn.schedules import ExponentialMovingAverage


def conjugate_gradients(Ax, b, cg_iters=10, eps=1e-8):
    """
    Conjugate gradient algorithm
    (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)

    Solves ``A x = b`` for ``x`` given a callable ``Ax`` computing the
    matrix-vector product and the right-hand side ``b``. ``cg_iters`` is the
    fixed number of iterations and ``eps`` a small constant avoiding division
    by zero.
    """
    x = tensor.zeros_like(b)
    # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0.
    # Change if doing warm start.
    r = b
    p = r
    r_dot_old = tensor.dot(r, r)

    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (tensor.dot(p, z) + eps)
        x = x + alpha * p
        r = r - alpha * z
        r_dot_new = tensor.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

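# Usage sketch (not part of the original module): solving a small symmetric
# positive-definite system A x = b with the routine above. tensor.dot and the
# no-input symjax.function call are assumed to behave as in the public symjax
# API; the matrix values are made up for illustration.
def _example_conjugate_gradients():
    A = tensor.Variable(
        numpy.array([[3.0, 1.0], [1.0, 2.0]], dtype="float32"), trainable=False
    )
    b = tensor.Variable(numpy.ones(2, dtype="float32"), trainable=False)
    # build the symbolic solution, then compile a function that evaluates it
    x = conjugate_gradients(lambda v: tensor.dot(A, v), b)
    return symjax.function(outputs=x)()
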
class Optimizer:
    def __init__(self, *args, name=None, **kwargs):
        if name is None:
            name = self.__NAME__

        with symjax.Scope(name):
            self.create_updates(*args, **kwargs)
            self._scope_name = symjax.current_graph().scope_name

    def reset(self):
        if hasattr(self, "variables"):
            for var in self.variables:
                var.reset()

    @property
    def updates(self):
        return symjax.get_updates(scope=self._scope_name)

    def _get_grads(self, grads_or_loss, params):
        # if a loss is given, differentiate it; otherwise assume the
        # gradients were already computed and passed directly
        if isinstance(grads_or_loss, (tuple, list, tensor.MultiOutputOp)):
            return grads_or_loss
        elif isinstance(grads_or_loss, tensor.Tensor):
            return gradients(grads_or_loss, params)
        else:
            return grads_or_loss

    def _get_variables(self, loss):
        params = symjax.get_variables(trainable=True)
        return [p for p in params if symjax.current_graph().is_connected(p, loss)]

    def add_updates(self, update):
        if not hasattr(self, "_updates"):
            self._updates = {}
        self._updates.update(update)
        symjax.current_graph().add_updates(update)

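# Sketch of how a custom optimizer plugs into the base class above (not part
# of the original module): a subclass only needs to define __NAME__ and
# create_updates(), registering its parameter updates via add_updates().
# The global "scale" factor is a hypothetical parameter used for illustration.
class _ExampleScaledSGD(Optimizer):
    """Plain gradient descent with a hypothetical global gradient scaling."""

    __NAME__ = "ExampleScaledSGDOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, scale=1.0, params=None):
        if params is None:
            params = self._get_variables(grads_or_loss)
        grads = self._get_grads(grads_or_loss, params)
        updates = {
            param: param - learning_rate * scale * grad
            for param, grad in zip(params, grads)
        }
        self.add_updates(updates)
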
class SGD(Optimizer):
    """Stochastic gradient descent optimization.

    Notice that SGD is also the acronym employed in ``tf.keras.optimizers.SGD``
    and in ``torch.optim.sgd``, but it can be misleading. Those
    implementations, like this one, actually implement GD: the term SGD only
    applies when gradient descent uses a single (random) sample to compute the
    gradients. With multiple samples it is commonly referred to as mini-batch
    GD, and when the entire dataset is used the optimizer is referred to as GD.
    See an illustrative discussion
    `here <https://towardsdatascience.com/difference-between-batch-gradient-descent-and-stochastic-gradient-descent-1187f1291aa1>`_.

    The produced update for parameter θ and a given learning rate α is:

    .. math::

        θ = θ - α ∇_{θ} L

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate: constant or Tensor
        the learning rate used to update the parameters

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "SGDOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, params=None):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        grads = self._get_grads(grads_or_loss, params)

        updates = dict()
        for param, grad in zip(params, grads):
            updates[param] = param - learning_rate * grad

        self.add_updates(updates)

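# Minimal training-loop sketch for SGD (not part of the original module). The
# Placeholder shapes, the symjax.function signature and the toy least-squares
# loss are assumptions based on the public symjax API, not taken from this file.
def _example_sgd():
    x = tensor.Placeholder((4,), "float32")
    y = tensor.Placeholder((4,), "float32")
    w = tensor.Variable(numpy.zeros(4, dtype="float32"), trainable=True)
    loss = tensor.sum((w * x - y) ** 2)

    sgd = SGD(loss, learning_rate=0.01)
    # the optimizer's updates are applied each time the compiled function runs
    train = symjax.function(x, y, outputs=loss, updates=sgd.updates)

    data_x = numpy.random.randn(4).astype("float32")
    data_y = numpy.random.randn(4).astype("float32")
    for _ in range(100):
        train(data_x, data_y)
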
class NesterovMomentum(Optimizer):
    """Nesterov momentum optimization.

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate: constant or Tensor
        the learning rate used to update the parameters

    momentum: constant or Tensor
        the amount of momentum to be applied

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "NesterovMomentumOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, momentum, params=None):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        grads = self._get_grads(grads_or_loss, params)

        updates = dict()
        variables = []
        for param, grad in zip(params, grads):
            # one velocity variable per parameter, initialized at zero
            velocity = tensor.Variable(
                numpy.zeros(param.shape, dtype=param.dtype), trainable=False
            )
            variables.append(velocity)
            update = param - learning_rate * grad
            # x is the new velocity: momentum * velocity - learning_rate * grad
            x = momentum * velocity + update - param
            updates[velocity] = x
            updates[param] = momentum * x + update

        self.add_updates(updates)

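# Pure-numpy reference for one Nesterov step (not part of the original
# module), spelling out the algebra behind the symbolic updates above:
#   new_velocity = momentum * velocity - learning_rate * grad
#   new_param    = param + momentum * new_velocity - learning_rate * grad
# which is exactly updates[velocity] and updates[param] after substituting
# update = param - learning_rate * grad.
def _nesterov_step_reference(param, velocity, grad, learning_rate, momentum):
    update = param - learning_rate * grad
    # momentum * velocity - learning_rate * grad
    new_velocity = momentum * velocity + update - param
    # param + momentum * new_velocity - learning_rate * grad
    new_param = momentum * new_velocity + update
    return new_param, new_velocity
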
class Adam(Optimizer):
    r"""Adaptive gradient-based optimization with renormalization.

    The update rule for ``variable`` with gradient ``g`` uses the optimization
    described at the end of section 2 of the Kingma and Ba paper, with
    learning rate α.

    If ``amsgrad`` is ``False``:

    **initialization**:

    - :math:`m_0 = 0` (initialize the 1st moment vector)
    - :math:`v_0 = 0` (initialize the 2nd moment vector)
    - :math:`t = 0` (initialize the timestep)

    **update**:

    - :math:`t = t + 1`
    - :math:`α_t = α × \sqrt{1 - β_2^t}/(1 - β_1^t)`
    - :math:`m_t = β_1 × m_{t-1} + (1 - β_1) × g`
    - :math:`v_t = β_2 × v_{t-1} + (1 - β_2) × g \odot g`
    - :math:`variable = variable - α_t × m_t / (\sqrt{v_t} + ε)`

    If ``amsgrad`` is ``True``:

    **initialization**:

    - :math:`m_0 = 0` (initialize the 1st moment vector)
    - :math:`v_0 = 0` (initialize the 2nd moment vector)
    - :math:`v'_0 = 0` (initialize the maximum of the 2nd moment vector)
    - :math:`t = 0` (initialize the timestep)

    **update**:

    - :math:`t = t + 1`
    - :math:`α_t = α × \sqrt{1 - β_2^t}/(1 - β_1^t)`
    - :math:`m_t = β_1 × m_{t-1} + (1 - β_1) × g`
    - :math:`v_t = β_2 × v_{t-1} + (1 - β_2) × g \odot g`
    - :math:`v'_t := \max(v'_{t-1}, v_t)`
    - :math:`variable = variable - α_t × m_t / (\sqrt{v'_t} + ε)`

    The default value of :math:`\epsilon = 1e-7` might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is
    "epsilon hat" in the paper.

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate (α): constant or Tensor
        the learning rate used to update the parameters

    amsgrad: bool
        whether to use the amsgrad updates or not

    β_1: constant or Tensor
        the decay of the exponential moving average of the gradients
        through time (updates)

    β_2: constant or Tensor
        the decay of the exponential moving average of the variance of the
        gradients through time

    ε: constant or Tensor
        the value added to the second-order moment

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters; if not given, the optimizer will find all
        trainable variables involved with the given loss

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "AdamOptimizer"

    def create_updates(
        self,
        grads_or_loss,
        learning_rate,
        amsgrad=False,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        params=None,
    ):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        if len(params) == 0:
            raise RuntimeError(
                "no parameters are given for the gradients, this can be due "
                "to passing explicitly an empty list or to passing a loss "
                "connected to no trainable weights"
            )

        grads = self._get_grads(grads_or_loss, params)

        local_step = tensor.Variable(1, dtype="int32", trainable=False)
        updates = {local_step: local_step + 1}

        # bias-corrected learning rate (formulation just before Section 2.1)
        beta_1_t = tensor.power(beta_1, local_step)
        beta_2_t = tensor.power(beta_2, local_step)
        lr = learning_rate * (tensor.sqrt(1 - beta_2_t) / (1 - beta_1_t))

        for param, grad in zip(params, grads):
            # exponential moving averages of the gradient and its square
            m = ExponentialMovingAverage(grad, beta_1, debias=False)[0]
            v = ExponentialMovingAverage(grad ** 2, beta_2, debias=False)[0]

            if amsgrad:
                v_hat = tensor.Variable(
                    tensor.zeros_like(param), name="v_hat", trainable=False
                )
                updates[v_hat] = tensor.maximum(v_hat, v)
                update = m / (tensor.sqrt(updates[v_hat]) + epsilon)
            else:
                update = m / (tensor.sqrt(v) + epsilon)

            # at the very first step the moving averages carry no history,
            # so fall back to the raw gradient
            update = tensor.where(local_step == 1, grad, update)
            updates[param] = param - lr * update

        self.add_updates(updates)

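# Minimal usage sketch for Adam (not part of the original module); the model
# is a toy elementwise-linear fit and the Placeholder / symjax.function usage
# follows the same assumptions as the SGD sketch above.
def _example_adam():
    x = tensor.Placeholder((8,), "float32")
    y = tensor.Placeholder((8,), "float32")
    w = tensor.Variable(numpy.zeros(8, dtype="float32"), trainable=True)
    loss = tensor.mean((w * x - y) ** 2)

    adam = Adam(loss, learning_rate=0.001, amsgrad=True)
    train = symjax.function(x, y, outputs=loss, updates=adam.updates)

    data_x = numpy.random.randn(8).astype("float32")
    data_y = numpy.random.randn(8).astype("float32")
    for _ in range(100):
        train(data_x, data_y)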