Source code for symjax.nn.optimizers

import numpy

import symjax
from symjax import tensor
from ..base import gradients
from symjax.nn.schedules import ExponentialMovingAverage


def conjugate_gradients(Ax, b, cg_iters=10, eps=1e-8):
    """
    Conjugate gradient algorithm
    (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)

    Solves ``A x = b`` for ``x`` given a callable ``Ax`` computing the
    matrix-vector product and the right-hand side ``b``. ``cg_iters`` is the
    fixed number of iterations and ``eps`` a small constant avoiding division
    by zero.
    """
    x = tensor.zeros_like(b)
    # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0.
    # Change if doing warm start.
    r = b
    p = r
    r_dot_old = tensor.dot(r, r)

    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (tensor.dot(p, z) + eps)
        x = x + alpha * p
        r = r - alpha * z
        r_dot_new = tensor.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

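# Usage sketch (not part of the original module): solving a small symmetric
# positive-definite system A x = b with the routine above. tensor.dot and the
# no-input symjax.function call are assumed to behave as in the public symjax
# API; the matrix values are made up for illustration.
def _example_conjugate_gradients():
    A = tensor.Variable(
        numpy.array([[3.0, 1.0], [1.0, 2.0]], dtype="float32"), trainable=False
    )
    b = tensor.Variable(numpy.ones(2, dtype="float32"), trainable=False)
    # build the symbolic solution, then compile a function that evaluates it
    x = conjugate_gradients(lambda v: tensor.dot(A, v), b)
    return symjax.function(outputs=x)()
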
class Optimizer:
    def __init__(self, *args, name=None, **kwargs):
        if name is None:
            name = self.__NAME__

        with symjax.Scope(name):
            self.create_updates(*args, **kwargs)
            self._scope_name = symjax.current_graph().scope_name

    def reset(self):
        if hasattr(self, "variables"):
            for var in self.variables:
                var.reset()

    @property
    def updates(self):
        return symjax.get_updates(scope=self._scope_name)

    def _get_grads(self, grads_or_loss, params):
        # if a loss is given, differentiate it; otherwise assume the
        # gradients were already computed and passed directly
        if isinstance(grads_or_loss, (tuple, list, tensor.MultiOutputOp)):
            return grads_or_loss
        elif isinstance(grads_or_loss, tensor.Tensor):
            return gradients(grads_or_loss, params)
        else:
            return grads_or_loss

    def _get_variables(self, loss):
        params = symjax.get_variables(trainable=True)
        return [p for p in params if symjax.current_graph().is_connected(p, loss)]

    def add_updates(self, update):
        if not hasattr(self, "_updates"):
            self._updates = {}
        self._updates.update(update)
        symjax.current_graph().add_updates(update)

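# Sketch of how a custom optimizer plugs into the base class above (not part
# of the original module): a subclass only needs to define __NAME__ and
# create_updates(), registering its parameter updates via add_updates().
# The global "scale" factor is a hypothetical parameter used for illustration.
class _ExampleScaledSGD(Optimizer):
    """Plain gradient descent with a hypothetical global gradient scaling."""

    __NAME__ = "ExampleScaledSGDOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, scale=1.0, params=None):
        if params is None:
            params = self._get_variables(grads_or_loss)
        grads = self._get_grads(grads_or_loss, params)
        updates = {
            param: param - learning_rate * scale * grad
            for param, grad in zip(params, grads)
        }
        self.add_updates(updates)
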
class SGD(Optimizer):
    """Stochastic gradient descent optimization.

    Notice that SGD is also the acronym employed in ``tf.keras.optimizers.SGD``
    and in ``torch.optim.sgd``, but it can be misleading. Those
    implementations, like this one, actually implement GD: the term SGD only
    applies when gradient descent uses a single (random) sample to compute the
    gradients. With multiple samples it is commonly referred to as mini-batch
    GD, and when the entire dataset is used the optimizer is referred to as GD.
    See an illustrative discussion
    `here <https://towardsdatascience.com/difference-between-batch-gradient-descent-and-stochastic-gradient-descent-1187f1291aa1>`_.

    The produced update for parameter θ and a given learning rate α is:

    .. math::

        θ = θ - α ∇_{θ} L

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate: constant or Tensor
        the learning rate used to update the parameters

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "SGDOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, params=None):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        grads = self._get_grads(grads_or_loss, params)

        updates = dict()
        for param, grad in zip(params, grads):
            updates[param] = param - learning_rate * grad

        self.add_updates(updates)

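# Minimal training-loop sketch for SGD (not part of the original module). The
# Placeholder shapes, the symjax.function signature and the toy least-squares
# loss are assumptions based on the public symjax API, not taken from this file.
def _example_sgd():
    x = tensor.Placeholder((4,), "float32")
    y = tensor.Placeholder((4,), "float32")
    w = tensor.Variable(numpy.zeros(4, dtype="float32"), trainable=True)
    loss = tensor.sum((w * x - y) ** 2)

    sgd = SGD(loss, learning_rate=0.01)
    # the optimizer's updates are applied each time the compiled function runs
    train = symjax.function(x, y, outputs=loss, updates=sgd.updates)

    data_x = numpy.random.randn(4).astype("float32")
    data_y = numpy.random.randn(4).astype("float32")
    for _ in range(100):
        train(data_x, data_y)
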
class NesterovMomentum(Optimizer):
    """Nesterov momentum optimization.

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate: constant or Tensor
        the learning rate used to update the parameters

    momentum: constant or Tensor
        the amount of momentum to be applied

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "NesterovMomentumOptimizer"

    def create_updates(self, grads_or_loss, learning_rate, momentum, params=None):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        grads = self._get_grads(grads_or_loss, params)

        updates = dict()
        variables = []
        for param, grad in zip(params, grads):
            # one velocity variable per parameter, initialized at zero
            velocity = tensor.Variable(
                numpy.zeros(param.shape, dtype=param.dtype), trainable=False
            )
            variables.append(velocity)
            update = param - learning_rate * grad
            # x is the new velocity: momentum * velocity - learning_rate * grad
            x = momentum * velocity + update - param
            updates[velocity] = x
            updates[param] = momentum * x + update

        self.add_updates(updates)

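# Pure-numpy reference for one Nesterov step (not part of the original
# module), spelling out the algebra behind the symbolic updates above:
#   new_velocity = momentum * velocity - learning_rate * grad
#   new_param    = param + momentum * new_velocity - learning_rate * grad
# which is exactly updates[velocity] and updates[param] after substituting
# update = param - learning_rate * grad.
def _nesterov_step_reference(param, velocity, grad, learning_rate, momentum):
    update = param - learning_rate * grad
    # momentum * velocity - learning_rate * grad
    new_velocity = momentum * velocity + update - param
    # param + momentum * new_velocity - learning_rate * grad
    new_param = momentum * new_velocity + update
    return new_param, new_velocity
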
class Adam(Optimizer):
    r"""Adaptive gradient-based optimization with renormalization.

    The update rule for ``variable`` with gradient ``g`` uses the optimization
    described at the end of section 2 of the Kingma and Ba paper, with
    learning rate α.

    If ``amsgrad`` is ``False``:

    **initialization**:

    - :math:`m_0 = 0` (initialize the 1st moment vector)
    - :math:`v_0 = 0` (initialize the 2nd moment vector)
    - :math:`t = 0` (initialize the timestep)

    **update**:

    - :math:`t = t + 1`
    - :math:`α_t = α × \sqrt{1 - β_2^t}/(1 - β_1^t)`
    - :math:`m_t = β_1 × m_{t-1} + (1 - β_1) × g`
    - :math:`v_t = β_2 × v_{t-1} + (1 - β_2) × g \odot g`
    - :math:`variable = variable - α_t × m_t / (\sqrt{v_t} + ε)`

    If ``amsgrad`` is ``True``:

    **initialization**:

    - :math:`m_0 = 0` (initialize the 1st moment vector)
    - :math:`v_0 = 0` (initialize the 2nd moment vector)
    - :math:`v'_0 = 0` (initialize the maximum of the 2nd moment vector)
    - :math:`t = 0` (initialize the timestep)

    **update**:

    - :math:`t = t + 1`
    - :math:`α_t = α × \sqrt{1 - β_2^t}/(1 - β_1^t)`
    - :math:`m_t = β_1 × m_{t-1} + (1 - β_1) × g`
    - :math:`v_t = β_2 × v_{t-1} + (1 - β_2) × g \odot g`
    - :math:`v'_t := \max(v'_{t-1}, v_t)`
    - :math:`variable = variable - α_t × m_t / (\sqrt{v'_t} + ε)`

    The default value of :math:`\epsilon = 1e-7` might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is
    "epsilon hat" in the paper.

    Parameters
    ----------

    grads_or_loss: scalar tensor or list of gradients
        either the loss (scalar Tensor) to be differentiated or the list of
        gradients already computed and possibly altered manually (such as
        clipping)

    learning_rate (α): constant or Tensor
        the learning rate used to update the parameters

    amsgrad: bool
        whether to use the amsgrad updates or not

    β_1: constant or Tensor
        the decay of the exponential moving average of the gradients
        through time (updates)

    β_2: constant or Tensor
        the decay of the exponential moving average of the variance of the
        gradients through time

    ε: constant or Tensor
        the value added to the second-order moment

    params: list (optional)
        if grads_or_loss is a list then it should be ordered w.r.t. the
        given parameters; if not given, the optimizer will find all
        trainable variables involved with the given loss

    Attributes
    ----------

    updates: list of updates

    variables: list of variables

    """

    __NAME__ = "AdamOptimizer"

    def create_updates(
        self,
        grads_or_loss,
        learning_rate,
        amsgrad=False,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        params=None,
    ):

        if isinstance(grads_or_loss, list):
            assert params

        if params is None:
            params = self._get_variables(grads_or_loss)
        elif not isinstance(params, list):
            raise RuntimeError("given params should be a list")

        if len(params) == 0:
            raise RuntimeError(
                "no parameters are given for the gradients, this can be due "
                "to passing explicitly an empty list or to passing a loss "
                "connected to no trainable weights"
            )

        grads = self._get_grads(grads_or_loss, params)

        local_step = tensor.Variable(1, dtype="int32", trainable=False)
        updates = {local_step: local_step + 1}

        # bias-corrected learning rate (formulation just before Section 2.1)
        beta_1_t = tensor.power(beta_1, local_step)
        beta_2_t = tensor.power(beta_2, local_step)
        lr = learning_rate * (tensor.sqrt(1 - beta_2_t) / (1 - beta_1_t))

        for param, grad in zip(params, grads):
            # exponential moving averages of the gradient and its square
            m = ExponentialMovingAverage(grad, beta_1, debias=False)[0]
            v = ExponentialMovingAverage(grad ** 2, beta_2, debias=False)[0]

            if amsgrad:
                v_hat = tensor.Variable(
                    tensor.zeros_like(param), name="v_hat", trainable=False
                )
                updates[v_hat] = tensor.maximum(v_hat, v)
                update = m / (tensor.sqrt(updates[v_hat]) + epsilon)
            else:
                update = m / (tensor.sqrt(v) + epsilon)

            # at the very first step the moving averages carry no history,
            # so fall back to the raw gradient
            update = tensor.where(local_step == 1, grad, update)
            updates[param] = param - lr * update

        self.add_updates(updates)

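# Minimal usage sketch for Adam (not part of the original module); the model
# is a toy elementwise-linear fit and the Placeholder / symjax.function usage
# follows the same assumptions as the SGD sketch above.
def _example_adam():
    x = tensor.Placeholder((8,), "float32")
    y = tensor.Placeholder((8,), "float32")
    w = tensor.Variable(numpy.zeros(8, dtype="float32"), trainable=True)
    loss = tensor.mean((w * x - y) ** 2)

    adam = Adam(loss, learning_rate=0.001, amsgrad=True)
    train = symjax.function(x, y, outputs=loss, updates=adam.updates)

    data_x = numpy.random.randn(8).astype("float32")
    data_y = numpy.random.randn(8).astype("float32")
    for _ in range(100):
        train(data_x, data_y)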