Source code for symjax.nn.losses

from .. import tensor as T
import numpy as np
from .. import probabilities
from symjax import nn
from .ops_nn import softmax, log_softmax, softplus


def huber(targets, predictions, delta=1.0):
    r"""Huber loss (regression).

    For each value ``x`` in ``error = targets - predictions``, the following is
    calculated:

    - :math:`0.5 \times x^2` if :math:`|x| \leq \Delta`
    - :math:`0.5 \times \Delta^2 + \Delta \times (|x| - \Delta)` if :math:`|x| > \Delta`

    leading to

    .. plot::

        import matplotlib.pyplot as plt
        import numpy as np
        x = np.linspace(-3, 3, 300)
        huber = lambda x, delta: np.where(
            np.abs(x) <= delta,
            0.5 * x ** 2,
            0.5 * delta ** 2 + delta * (np.abs(x) - delta),
        )
        plt.plot(x, huber(x, 0.5))
        plt.plot(x, huber(x, 1))
        plt.plot(x, huber(x, 2))
        plt.plot(x, x ** 2, '--k')
        plt.xlabel(r'$x$')
        plt.ylabel(r'huber$(x,\Delta)$')
        plt.legend([r'$\Delta=0.5$', r'$\Delta=1$', r'$\Delta=2$', r'$x^2$'])
        plt.tight_layout()
        plt.show()

    `Wikipedia <https://en.wikipedia.org/wiki/Huber_loss>`_

    Args:
        targets: The ground truth output tensor, same dimensions as `predictions`.
        predictions: The predicted outputs.
        delta (:math:`\Delta`): `float`, the point where the Huber loss function
            changes from quadratic to linear.

    Returns:
        loss: `float` tensor with the same shape as `targets`.
    """
    error = predictions - targets
    abs_error = T.abs(error)
    losses = T.where(
        abs_error <= delta, 0.5 * error ** 2, delta * (abs_error - 0.5 * delta)
    )
    return losses

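
# Usage sketch: a NumPy cross-check of the piecewise Huber formula documented
# above; `_huber_numpy_sketch` is a hypothetical helper added for illustration
# only, it is not part of symjax.
def _huber_numpy_sketch(targets, predictions, delta=1.0):
    error = np.asarray(predictions) - np.asarray(targets)
    abs_error = np.abs(error)
    # quadratic inside the margin, linear outside, matching the docstring
    return np.where(
        abs_error <= delta, 0.5 * error ** 2, delta * (abs_error - 0.5 * delta)
    )
# e.g. _huber_numpy_sketch([0.0, 0.0], [0.5, 3.0]) -> [0.125, 2.5]
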

def explained_variance(y, ypred, axis=None, epsilon=1e-6):
    r"""Computes the fraction of the variance of ``y`` that ``ypred`` explains.

    The formula is

    .. math::
        1 - Var[y - ypred] / Var[y]

    and in the special case of centered targets and predictions it becomes

    .. math::
        1 - \|y - ypred\|_2^2 / \|y\|_2^2

    hence it can be seen as an :math:`\ell_2` loss rescaled by the energy of
    the targets.

    Interpretation:

    - ev=0  => might as well have predicted zero
    - ev=1  => perfect prediction
    - ev<0  => worse than just predicting zero

    Parameters
    ----------
    y: Tensor-like
        true target

    ypred: Tensor-like
        prediction

    axis: integer or None (default=None)
        the axis along which to compute the variance, by default uses all axes

    epsilon (:math:`\epsilon`): float (default=1e-6)
        the constant added to the denominator

    Returns
    -------
    .. math::
        1 - Var(y - ypred) / (Var(y) + \epsilon)

    Notes
    -----
    This is not a symmetric function.
    """
    return 1 - (y - ypred).var(axis=axis) / (y.var(axis=axis) + epsilon)

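
# Usage sketch: the same explained-variance formula evaluated with plain NumPy;
# `_explained_variance_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _explained_variance_numpy_sketch(y, ypred, axis=None, epsilon=1e-6):
    y, ypred = np.asarray(y, dtype=float), np.asarray(ypred, dtype=float)
    return 1 - (y - ypred).var(axis=axis) / (y.var(axis=axis) + epsilon)
# a perfect prediction gives ~1, predicting the mean gives ~0:
# _explained_variance_numpy_sketch([1., 2., 3.], [1., 2., 3.])  -> ~1.0
# _explained_variance_numpy_sketch([1., 2., 3.], [2., 2., 2.])  -> ~0.0
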

def vae(x, x_hat, q_mean, q_cov, z_mean=None, z_cov=None, x_cov=None):
    """Variational autoencoder loss for N samples of dimension D mapped to a
    latent space of dimension K with Gaussian distributions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    q_mean: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    q_cov: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    z_mean: array
        should be of shape (K,), mean of the z variable

    z_cov: array
        should be of shape (K,), log-std of the z variable

    x_cov: array, optional
        should be of shape (N, D), covariance of the observation model;
        defaults to ones
    """
    if x_cov is None:
        x_cov = T.ones_like(x_hat)
    if z_mean is None:
        z_mean = T.zeros_like(q_mean)
    if z_cov is None:
        z_cov = T.ones_like(z_mean)

    p = probabilities.Normal(x, cov=x_cov)
    q = probabilities.Normal(q_mean, cov=q_cov)
    z = probabilities.Normal(z_mean, cov=z_cov)

    loss = -(p.log_prob(x_hat) - probabilities.KL(z, q))
    return loss


def vae_gmm(x, x_hat, z_mu, z_logvar, mu, logvar, logpi, logvar_x=0.0, eps=1e-8):
    """Gaussian-mixture VAE loss for N samples of dimension D mapped to a
    latent space of C clusters in K dimensions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    z_mu: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    z_logvar: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    mu: array
        should be of shape (C, K), parameter (centroids)

    logvar: array
        should be of shape (C, K), parameter (log-variance of clusters)

    logpi: array
        should be of shape (C,), parameter (log-prior of clusters)

    :param logvar_x:
    :param eps:
    """
    var = T.exp(logvar)
    z_var = T.exp(z_logvar)

    # predict the log probability of clusters, shape will be (N, C)
    # and compute p(t|z) = p(z|t)p(t) / (\sum_t p(z|t)p(t))
    logprob = (
        logpi[:, None]
        - 0.5 * (T.log(2 * np.pi) + logvar)
        - (z_mu[:, None, :] - mu) ** 2 / (2 * var)
    ).sum(2)
    pt_z = nn.softmax(logprob)

    # E_{q(z,c|x)}[log(p(x|z))]
    px_z = -0.5 * ((x - x_hat) ** 2 / T.exp(logvar_x) + logvar_x).sum(1)

    # - E_{q(z,c|x)}[log(q(c|x))]: entropy of the categorical
    h_c = -(pt_z * nn.log_softmax(logprob)).sum(1)

    # - E_{q(z,c|x)}[log(q(z|x))]: entropy of the normal
    h_z = 0.5 * z_logvar.sum(1)

    # E_{q(z,c|x)}[log(p(z|c))]
    ll_z = -0.5 * (
        pt_z
        * (
            logvar
            + z_var[:, None, :] / var
            - 1
            + (z_mu[:, None, :] - mu) ** 2 / var
        ).sum(-1)
    ).sum(-1)

    # E_{q(z,c|x)}[log(p(c))]
    p_c = (pt_z * logpi).sum(1)

    loss = -(px_z + ll_z + p_c + h_c + h_z)

    return loss, px_z + p_c, pt_z


def vae_comp_gmm(x, x_hat, z_mu, z_logvar, mu, logvar, logpi, logvar_x=0.0, eps=1e-8):
    """Gaussian-mixture VAE loss for N samples of dimension D mapped to a
    latent space of I pieces, each of C clusters, in K dimensions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    z_mu: array
        should be of shape (N, I, K), inferred mean of the variational Gaussian

    z_logvar: array
        should be of shape (N, I, K), inferred log-variance of the variational
        Gaussian

    mu: array
        should be of shape (I, C, K), parameter (centroids)

    logvar: array
        should be of shape (I, C, K), parameter (log-variance of clusters)

    logpi: array
        should be of shape (I, C), parameter (log-prior of clusters)

    :param logvar_x:
    :param eps:
    """
    var = T.exp(logvar)
    z_var = T.exp(z_logvar)

    # predict the log probability of clusters, shape will be (N, I, C)
    # and compute p(t_i|z_i) = p(z_i|t_i)p(t_i) / (\sum_{t_i} p(z_i|t_i)p(t_i))
    logprob = (
        logpi[:, :, None]
        - 0.5 * (T.log(2 * np.pi) + logvar)
        - (z_mu[:, :, None, :] - mu) ** 2 / (2 * var)
    ).sum(3)
    pt_z = softmax(logprob)

    # E_{q(z,c|x)}[log(p(x|z))]
    px_z = ((x - x_hat) ** 2).sum(1)

    # - E_{q(z,c|x)}[log(q(c|x))]: entropy of the categorical
    h_c = -(pt_z * log_softmax(logprob)).sum((1, 2))

    # - E_{q(z,c|x)}[log(q(z|x))]: entropy of the normal
    h_z = 0.5 * z_logvar.sum((1, 2))

    # E_{q(z,c|x)}[log(p(z|c))]
    ll_z = -0.5 * (
        pt_z
        * (
            logvar
            + z_var[:, :, None, :] / var
            - 1
            + (z_mu[:, :, None, :] - mu) ** 2 / var
        ).sum(-1)
    ).sum((1, 2))

    # E_{q(z,c|x)}[log(p(c))]
    p_c = (pt_z * logpi).sum((1, 2))

    loss = -(px_z + ll_z + p_c + h_c + h_z)

    return loss


def FGMM_VAE(
    x,
    x_rec,
    x_logvar,
    z_logvar,
    q_mu,
    q_logvar,
    mu,
    q_loggamma,
    q_logeta,
    logpi,
    logpia,
    mode="bernoulli",
    eps=1e-5,
):
    """N samples of dimension D mapped to a latent space of dimension K with F
    factors of C clusters in K dimensions.

    Parameters
    ----------
    x: array (observation)
        should be of shape (N, D)

    x_rec: array (reconstruction, output of decoder)
        should be of shape (N, D)

    x_logvar: array (parameter)
        should be of shape (D,)

    z_logvar: array (parameter)
        should be of shape (K,)

    q_mu: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    q_logvar: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    mu: array (parameter)
        should be of shape (F, C, K), parameter (centroids)

    q_loggamma: array
        should be of shape (N, F, C), inferred log-probabilities of the
        cluster assignments

    logpi: array (parameter)
        should be of shape (F, C), parameter (log-prior of clusters)

    logpia: array (parameter)
        should be of shape (F,)

    :param q_logeta:
    :param mode:
    :param eps:
    """
    q_var = T.exp(q_logvar)
    q_gamma = T.exp(q_loggamma)
    q_eta = T.exp(q_logeta)
    z_var = T.exp(z_logvar)
    x_var = T.exp(x_logvar)
    pi = T.exp(logpi)
    pia = T.exp(logpia)

    K = z_var.shape[0]
    D = x_var.shape[0]
    F = logpi.shape[0]
    log2pi = T.log(2 * np.pi)

    # reconstruction part (first expectation)
    E1 = -0.5 * (((x - x_rec) ** 2 / x_var).sum(1) + x_logvar.sum() + D * log2pi)

    E2_1 = -0.5 * (log2pi + z_logvar + (q_var + q_mu ** 2) / z_var).sum(1)
    E2_2 = T.einsum("nf,nfc,fck,nk->n", q_eta, q_gamma, mu, q_mu / z_var)
    E2_3 = -0.5 * (T.einsum("nf,nfc,fck->nk", q_eta, q_gamma, mu ** 2) / z_var).sum(1)

    if mode == "bernoulli":
        q_gammaeta = T.einsum("nf,nfc->nfc", q_eta, q_gamma)
        corr = T.einsum(
            "fcd,nfc,abk,nab->nfa", mu / z_var, q_gammaeta, mu, q_gammaeta
        )
        E2_4 = -0.5 * T.sum(corr * (1 - T.eye(F)), (1, 2))
    else:
        E2_4 = 0.0

    E3 = (q_gamma * logpi).sum((1, 2))

    if mode == "bernoulli":
        E4 = (q_eta * logpia + (1 - q_eta) * T.log(1 - pia + eps)).sum(1)
    else:
        E4 = (q_eta * logpia).sum(1)

    # now on to the entropy
    H = (
        K * (log2pi + 1) / 2
        + 0.5 * q_logvar.sum(1)
        - (q_gamma * q_loggamma).sum((1, 2))
    )

    if mode == "bernoulli":
        Ha = -(q_eta * q_logeta + (1 - q_eta) * T.log(1 - q_eta + eps)).sum(1)
    else:
        Ha = -(q_eta * q_logeta).sum(1)

    return -(E1 + E2_1 + E2_2 + E2_3 + E2_4 + E3 + E4 + H + Ha)


def sparse_softmax_crossentropy_logits(p, q):
    r"""Cross-entropy loss where :math:`p` is sparse (class indices) and
    :math:`q` are the unnormalized log-probabilities (logits).

    The formal definition, given that :math:`p` is now an index (of the Dirac)
    s.t. :math:`p \in \{1, \dots, D\}` and :math:`q` is unnormalized
    (log-probabilities), is (for discrete variables, :math:`p` sparse)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + \log\Big(\sum_{d=1}^D \exp(q_d)\Big)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + LogSumExp(q)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + LogSumExp(q - \max_{d} q_d)

    or (non-sparse :math:`p`)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d}
                           + \log\Big(\sum_{d=1}^D \exp(q_d)\Big)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d} + LogSumExp(q)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d}
                           + LogSumExp(q - \max_{d} q_d)

    with :math:`p` the class index and :math:`q` the predicted logits (output
    of the network). This function takes a vector of class indices ``p`` and a
    matrix of unnormalized log-probabilities ``q``.
    """
    # the linear part of the loss
    # one = T.equal(p[:, None], T.arange(q.shape[1])).astype("float32")
    # return -(one * log_softmax(q)).sum(1)
    return -T.take_along_axis(log_softmax(q), p[:, None], 1).squeeze(1)

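
# Usage sketch: a NumPy version of the sparse cross-entropy above,
# -q_p + LogSumExp(q), with the usual max-shift for numerical stability;
# `_sparse_softmax_ce_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _sparse_softmax_ce_numpy_sketch(p, q):
    q = np.asarray(q, dtype=float)   # (N, D) logits
    p = np.asarray(p, dtype=int)     # (N,) class indices
    shifted = q - q.max(axis=1, keepdims=True)
    logsumexp = np.log(np.exp(shifted).sum(axis=1)) + q.max(axis=1)
    return logsumexp - q[np.arange(q.shape[0]), p]
# e.g. _sparse_softmax_ce_numpy_sketch([0], [[2.0, 0.0]]) -> ~0.127 (= log(1 + e^-2))
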

def softmax_crossentropy_logits(p, q):
    """see sparse cross-entropy"""
    return -(p * log_softmax(q)).sum(-1)


def sparse_sigmoid_crossentropy_logits(labels, logits):
    return -logits * labels + softplus(logits)

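
# Usage sketch: the binary cross-entropy with logits written in the same
# numerically stable form as above, -labels * logits + softplus(logits),
# checked with NumPy; `_sigmoid_ce_numpy_sketch` is a hypothetical helper added
# for illustration only, it is not part of symjax.
def _sigmoid_ce_numpy_sketch(labels, logits):
    labels = np.asarray(labels, dtype=float)
    logits = np.asarray(logits, dtype=float)
    # stable softplus: log(1 + e^x) = log1p(e^{-|x|}) + max(x, 0)
    softplus_np = np.log1p(np.exp(-np.abs(logits))) + np.maximum(logits, 0.0)
    return -logits * labels + softplus_np
# e.g. _sigmoid_ce_numpy_sketch([1.0], [0.0]) -> [log(2)] ~= [0.693]
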

def hinge_loss(predictions, targets, delta=1):
    r"""(binary) hinge loss.

    For an intended output :math:`t = \pm 1` and a classifier score :math:`p`,
    the hinge loss of each datum is defined as

    .. math::
        \max(0, \Delta - t p)

    As soon as the loss is smaller than :math:`\Delta` the datum is correctly
    classified; the margin is further increased by pushing the loss toward
    :math:`0`, hence :math:`\Delta` is the user-defined preferred margin to
    reach. In standard SVMs :math:`\Delta = 1`, leading to

    .. plot::

        import matplotlib.pyplot as plt
        import numpy as np
        x = np.linspace(-3, 3, 300)
        y = 1
        hinge1 = np.maximum(0, 1 - x * y)
        hinge02 = np.maximum(0, 0.2 - x * y)
        plt.plot(x, hinge1)
        plt.plot(x, hinge02)
        plt.plot(x, (x < 0).astype('int32'), '--k')
        plt.axvline(1, c='r')
        plt.xlabel(r'predictions')
        plt.ylabel(r'hinge$(p,1,\Delta)$')
        plt.legend([r'hinge loss $\Delta=1$', r'hinge loss $\Delta=0.2$',
                    '0-1 loss', 'true label'])
        plt.tight_layout()
        plt.show()

    Note that :math:`p` should be the "raw" output of the classifier's decision
    function, not the predicted class label. For instance, in linear SVMs,
    :math:`p = \langle w, x \rangle + b` where :math:`w, b` are the parameters
    of the hyperplane and :math:`x` is the input variable(s).

    Parameters
    ----------
    predictions : 1D tensor
        raw (decision function) predictions of the classifier

    targets : 1D tensor
        binary targets with values in :math:`t \in \{-1, 1\}`

    delta : scalar, default 1
        the hinge loss margin

    Returns
    -------
    1D tensor
        An expression for the item-wise hinge loss

    Notes
    -----
    This is an alternative to the categorical cross-entropy loss for
    classification problems
    """
    return T.maximum(0, delta - predictions * targets)

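
# Usage sketch: the binary hinge loss max(0, delta - t*p) evaluated with NumPy;
# `_hinge_numpy_sketch` is a hypothetical helper added for illustration only,
# it is not part of symjax.
def _hinge_numpy_sketch(predictions, targets, delta=1.0):
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)   # entries in {-1, +1}
    return np.maximum(0.0, delta - predictions * targets)
# e.g. _hinge_numpy_sketch([2.0, 0.3, -1.0], [1, 1, 1]) -> [0.0, 0.7, 2.0]
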

def multiclass_hinge_loss(predictions, targets, delta=1):
    r"""multi-class hinge loss.

    .. math::
        L_i = \max_{j \neq t_i} \left(0, p_j - p_{t_i} + \Delta\right)

    Parameters
    ----------
    predictions : 2D tensor
        Predictions in (0, 1), such as softmax output of a neural network,
        with data points in rows and class probabilities in columns.

    targets : 2D tensor or 1D tensor
        Either a vector of int giving the correct class index per data point
        or a 2D tensor of one-hot encoding of the correct class in the same
        layout as predictions (non-binary targets in [0, 1] do not work!)

    delta : scalar, default 1
        The hinge loss margin

    Returns
    -------
    1D tensor
        An expression for the item-wise multi-class hinge loss

    Notes
    -----
    This is an alternative to the categorical cross-entropy loss for
    multi-class classification problems
    """
    num_cls = predictions.shape[1]
    if targets.ndim == predictions.ndim - 1:
        targets = T.to_one_hot(targets, num_cls)
    elif targets.ndim != predictions.ndim:
        raise TypeError("rank mismatch between targets and predictions")
    corrects = predictions[targets.nonzero()]
    rest = predictions[(1 - targets).nonzero()].reshape((-1, num_cls - 1))
    rest = rest.max(axis=1)
    return T.activations.relu(rest - corrects + delta)

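
# Usage sketch: a NumPy version of the multi-class hinge loss
# max(0, max_{j != t_i} p_j - p_{t_i} + delta) for integer targets;
# `_multiclass_hinge_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _multiclass_hinge_numpy_sketch(predictions, targets, delta=1.0):
    predictions = np.asarray(predictions, dtype=float)   # (N, C) scores
    targets = np.asarray(targets, dtype=int)             # (N,) class indices
    n = predictions.shape[0]
    corrects = predictions[np.arange(n), targets]        # score of the true class
    rest = predictions.copy()
    rest[np.arange(n), targets] = -np.inf                # exclude the true class
    return np.maximum(0.0, rest.max(axis=1) - corrects + delta)
# e.g. _multiclass_hinge_numpy_sketch([[2.0, 0.5, 0.1]], [0]) -> [0.0]
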

def squared_differences(x, y):
    """elementwise squared differences.

    Computes element-wise

    .. math::
        (x - y)^2

    broadcasting applies as in any other operation.

    `Wikipedia <https://en.wikipedia.org/wiki/Loss_function#Quadratic_loss_function>`_

    Parameters
    ----------
    x: tensor-like

    y: tensor-like

    Returns
    -------
    tensor-like
    """
    return (x - y) ** 2


def accuracy(targets, predictions):
    r"""classification accuracy.

    It is computed by averaging the
    `0-1 loss <https://en.wikipedia.org/wiki/Loss_function#0-1_loss_function>`_
    as in

    .. math::
        \frac{1}{N}\sum_{n=1}^N 1_{\{y_n = p_n\}}

    where :math:`p` denotes the predictions. The inputs must be vectors, but in
    the special case where ``targets`` is a vector and ``predictions`` is a
    matrix, the argmax is used to obtain the actual predictions, as in

    .. math::
        \frac{1}{N}\sum_{n=1}^N 1_{\{y_n = \arg\max_c p_{n,c}\}}

    `Wikipedia <https://en.wikipedia.org/wiki/Accuracy_and_precision>`_

    Parameters
    ----------
    targets: 1D tensor-like

    predictions: tensor-like
        it can be a :math:`2D` matrix in which case the ``argmax`` is used to
        get the prediction

    Returns
    -------
    tensor-like
    """
    if not hasattr(targets, "ndim"):
        targets = T.array(targets)
    if targets.ndim != 1:
        raise RuntimeError(
            f"targets should be of rank 1, given rank is {targets.ndim}"
        )
    if predictions.ndim == 2:
        accu = T.cast(T.equal(targets, predictions.argmax(1)), "float32")
    elif predictions.ndim == 1:
        accu = T.cast(T.equal(targets, predictions), "float32")
    else:
        raise RuntimeError(
            f"predictions should be of rank 1 or 2, given rank is {predictions.ndim}"
        )
    return accu.mean()

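
# Usage sketch: the 0-1 accuracy computed with NumPy, taking the argmax when
# `predictions` is a matrix, mirroring the function above;
# `_accuracy_numpy_sketch` is a hypothetical helper added for illustration
# only, it is not part of symjax.
def _accuracy_numpy_sketch(targets, predictions):
    targets = np.asarray(targets)
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        predictions = predictions.argmax(1)
    return (targets == predictions).mean()
# e.g. _accuracy_numpy_sketch([0, 1, 1], [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]]) -> 2/3
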

def _assign(cluster, pred, true):
    # label of one predicted cluster: the most frequent true label among the
    # points assigned to that cluster
    c_labels = T.where(T.equal(pred, cluster)[:, None], true, 0)
    per_cluster_counts = c_labels.sum(0)
    assignment = per_cluster_counts.argmax()
    return assignment


def clustering_accuracy(labels, predictions, n_clusters):
    """clustering accuracy based on intra-cluster majority labels.

    This accuracy quantifies the ability of a clustering algorithm to solve the
    clustering task given the true labels of the data. For each predicted
    cluster, this function finds the most frequent true label inside that
    cluster and uses it as the cluster label. The accuracy is then computed
    from those cluster labels.

    Args:
        labels: 1d integer Tensor
            the true labels of the data
        predictions: 1d integer Tensor
            the predicted data clusters
        n_clusters: int
            the number of clusters
    """
    one_hot_labels = T.one_hot(labels, n_clusters)
    cluster_assignment = T.map(
        _assign,
        sequences=[T.range(n_clusters, dtype="int32")],
        non_sequences=[predictions, one_hot_labels],
    )
    return accuracy(labels, cluster_assignment[predictions])

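
# Usage sketch: the same majority-vote relabeling done with NumPy, to make the
# cluster-to-label assignment explicit; `_clustering_accuracy_numpy_sketch` is
# a hypothetical helper added for illustration only, it is not part of symjax.
def _clustering_accuracy_numpy_sketch(labels, predictions, n_clusters):
    labels = np.asarray(labels, dtype=int)
    predictions = np.asarray(predictions, dtype=int)
    # for each predicted cluster, pick the most frequent true label in it
    assignment = np.zeros(n_clusters, dtype=int)
    for c in range(n_clusters):
        counts = np.bincount(labels[predictions == c], minlength=labels.max() + 1)
        assignment[c] = counts.argmax()
    return (assignment[predictions] == labels).mean()
# e.g. with labels [0, 0, 1, 1] and predictions [1, 1, 0, 0] the clusters are
# relabeled by majority vote and the accuracy is 1.0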