Source code for symjax.nn.losses

from .. import tensor as T
import numpy as np
from .. import probabilities
from symjax import nn
from .ops_nn import softmax, log_softmax, softplus


def huber(targets, predictions, delta=1.0):
    r"""Huber loss (regression).

    For each value ``x`` in ``error = targets - predictions``, the following is
    calculated:

    - :math:`0.5 \times x^2` if :math:`|x| \leq \Delta`
    - :math:`0.5 \times \Delta^2 + \Delta \times (|x| - \Delta)` if :math:`|x| > \Delta`

    leading to

    .. plot::

        import matplotlib.pyplot as plt
        import numpy as np
        x = np.linspace(-3, 3, 300)
        huber = lambda x, delta: np.where(
            np.abs(x) <= delta,
            0.5 * x ** 2,
            0.5 * delta ** 2 + delta * (np.abs(x) - delta),
        )
        plt.plot(x, huber(x, 0.5))
        plt.plot(x, huber(x, 1))
        plt.plot(x, huber(x, 2))
        plt.plot(x, x ** 2, '--k')
        plt.xlabel(r'$x$')
        plt.ylabel(r'huber$(x,\Delta)$')
        plt.legend([r'$\Delta=0.5$', r'$\Delta=1$', r'$\Delta=2$', r'$x^2$'])
        plt.tight_layout()
        plt.show()

    `Wikipedia <https://en.wikipedia.org/wiki/Huber_loss>`_

    Args:
        targets: The ground truth output tensor, same dimensions as `predictions`.
        predictions: The predicted outputs.
        delta (:math:`\Delta`): `float`, the point where the Huber loss function
            changes from quadratic to linear.

    Returns:
        loss: `float` tensor with the same shape as `targets`.
    """
    error = predictions - targets
    abs_error = T.abs(error)
    losses = T.where(
        abs_error <= delta, 0.5 * error ** 2, delta * (abs_error - 0.5 * delta)
    )
    return losses

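
# Usage sketch: a NumPy cross-check of the piecewise Huber formula documented
# above; `_huber_numpy_sketch` is a hypothetical helper added for illustration
# only, it is not part of symjax.
def _huber_numpy_sketch(targets, predictions, delta=1.0):
    error = np.asarray(predictions) - np.asarray(targets)
    abs_error = np.abs(error)
    # quadratic inside the margin, linear outside, matching the docstring
    return np.where(
        abs_error <= delta, 0.5 * error ** 2, delta * (abs_error - 0.5 * delta)
    )
# e.g. _huber_numpy_sketch([0.0, 0.0], [0.5, 3.0]) -> [0.125, 2.5]
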

def explained_variance(y, ypred, axis=None, epsilon=1e-6):
    r"""Computes the fraction of the variance of ``y`` that ``ypred`` explains.

    The formula is

    .. math::
        1 - Var[y - ypred] / Var[y]

    and in the special case of centered targets and predictions it becomes

    .. math::
        1 - \|y - ypred\|_2^2 / \|y\|_2^2

    hence it can be seen as an :math:`\ell_2` loss rescaled by the energy of
    the targets.

    Interpretation:

    - ev=0  => might as well have predicted zero
    - ev=1  => perfect prediction
    - ev<0  => worse than just predicting zero

    Parameters
    ----------
    y: Tensor-like
        true target

    ypred: Tensor-like
        prediction

    axis: integer or None (default=None)
        the axis along which to compute the variance, by default uses all axes

    epsilon (:math:`\epsilon`): float (default=1e-6)
        the constant added to the denominator

    Returns
    -------
    .. math::
        1 - Var(y - ypred) / (Var(y) + \epsilon)

    Notes
    -----
    This is not a symmetric function.
    """
    return 1 - (y - ypred).var(axis=axis) / (y.var(axis=axis) + epsilon)

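
# Usage sketch: the same explained-variance formula evaluated with plain NumPy;
# `_explained_variance_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _explained_variance_numpy_sketch(y, ypred, axis=None, epsilon=1e-6):
    y, ypred = np.asarray(y, dtype=float), np.asarray(ypred, dtype=float)
    return 1 - (y - ypred).var(axis=axis) / (y.var(axis=axis) + epsilon)
# a perfect prediction gives ~1, predicting the mean gives ~0:
# _explained_variance_numpy_sketch([1., 2., 3.], [1., 2., 3.])  -> ~1.0
# _explained_variance_numpy_sketch([1., 2., 3.], [2., 2., 2.])  -> ~0.0
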

def vae(x, x_hat, q_mean, q_cov, z_mean=None, z_cov=None, x_cov=None):
    """Variational autoencoder loss for N samples of dimension D mapped to a
    latent space of dimension K with Gaussian distributions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    q_mean: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    q_cov: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    z_mean: array
        should be of shape (K,), mean of the z variable

    z_cov: array
        should be of shape (K,), log-std of the z variable

    x_cov: array, optional
        should be of shape (N, D), covariance of the observation model;
        defaults to ones
    """
    if x_cov is None:
        x_cov = T.ones_like(x_hat)
    if z_mean is None:
        z_mean = T.zeros_like(q_mean)
    if z_cov is None:
        z_cov = T.ones_like(z_mean)

    p = probabilities.Normal(x, cov=x_cov)
    q = probabilities.Normal(q_mean, cov=q_cov)
    z = probabilities.Normal(z_mean, cov=z_cov)

    loss = -(p.log_prob(x_hat) - probabilities.KL(z, q))
    return loss


def vae_gmm(x, x_hat, z_mu, z_logvar, mu, logvar, logpi, logvar_x=0.0, eps=1e-8):
    """Gaussian-mixture VAE loss for N samples of dimension D mapped to a
    latent space of C clusters in K dimensions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    z_mu: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    z_logvar: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    mu: array
        should be of shape (C, K), parameter (centroids)

    logvar: array
        should be of shape (C, K), parameter (log-variance of clusters)

    logpi: array
        should be of shape (C,), parameter (log-prior of clusters)

    :param logvar_x:
    :param eps:
    """
    var = T.exp(logvar)
    z_var = T.exp(z_logvar)

    # predict the log probability of clusters, shape will be (N, C)
    # and compute p(t|z) = p(z|t)p(t) / (\sum_t p(z|t)p(t))
    logprob = (
        logpi[:, None]
        - 0.5 * (T.log(2 * np.pi) + logvar)
        - (z_mu[:, None, :] - mu) ** 2 / (2 * var)
    ).sum(2)
    pt_z = nn.softmax(logprob)

    # E_{q(z,c|x)}[log(p(x|z))]
    px_z = -0.5 * ((x - x_hat) ** 2 / T.exp(logvar_x) + logvar_x).sum(1)

    # - E_{q(z,c|x)}[log(q(c|x))]: entropy of the categorical
    h_c = -(pt_z * nn.log_softmax(logprob)).sum(1)

    # - E_{q(z,c|x)}[log(q(z|x))]: entropy of the normal
    h_z = 0.5 * z_logvar.sum(1)

    # E_{q(z,c|x)}[log(p(z|c))]
    ll_z = -0.5 * (
        pt_z
        * (
            logvar
            + z_var[:, None, :] / var
            - 1
            + (z_mu[:, None, :] - mu) ** 2 / var
        ).sum(-1)
    ).sum(-1)

    # E_{q(z,c|x)}[log(p(c))]
    p_c = (pt_z * logpi).sum(1)

    loss = -(px_z + ll_z + p_c + h_c + h_z)

    return loss, px_z + p_c, pt_z


def vae_comp_gmm(x, x_hat, z_mu, z_logvar, mu, logvar, logpi, logvar_x=0.0, eps=1e-8):
    """Gaussian-mixture VAE loss for N samples of dimension D mapped to a
    latent space of I pieces, each of C clusters, in K dimensions.

    Parameters
    ----------
    x: array
        should be of shape (N, D)

    x_hat: array
        should be of shape (N, D)

    z_mu: array
        should be of shape (N, I, K), inferred mean of the variational Gaussian

    z_logvar: array
        should be of shape (N, I, K), inferred log-variance of the variational
        Gaussian

    mu: array
        should be of shape (I, C, K), parameter (centroids)

    logvar: array
        should be of shape (I, C, K), parameter (log-variance of clusters)

    logpi: array
        should be of shape (I, C), parameter (log-prior of clusters)

    :param logvar_x:
    :param eps:
    """
    var = T.exp(logvar)
    z_var = T.exp(z_logvar)

    # predict the log probability of clusters, shape will be (N, I, C)
    # and compute p(t_i|z_i) = p(z_i|t_i)p(t_i) / (\sum_{t_i} p(z_i|t_i)p(t_i))
    logprob = (
        logpi[:, :, None]
        - 0.5 * (T.log(2 * np.pi) + logvar)
        - (z_mu[:, :, None, :] - mu) ** 2 / (2 * var)
    ).sum(3)
    pt_z = softmax(logprob)

    # E_{q(z,c|x)}[log(p(x|z))]
    px_z = ((x - x_hat) ** 2).sum(1)

    # - E_{q(z,c|x)}[log(q(c|x))]: entropy of the categorical
    h_c = -(pt_z * log_softmax(logprob)).sum((1, 2))

    # - E_{q(z,c|x)}[log(q(z|x))]: entropy of the normal
    h_z = 0.5 * z_logvar.sum((1, 2))

    # E_{q(z,c|x)}[log(p(z|c))]
    ll_z = -0.5 * (
        pt_z
        * (
            logvar
            + z_var[:, :, None, :] / var
            - 1
            + (z_mu[:, :, None, :] - mu) ** 2 / var
        ).sum(-1)
    ).sum((1, 2))

    # E_{q(z,c|x)}[log(p(c))]
    p_c = (pt_z * logpi).sum((1, 2))

    loss = -(px_z + ll_z + p_c + h_c + h_z)

    return loss


def FGMM_VAE(
    x,
    x_rec,
    x_logvar,
    z_logvar,
    q_mu,
    q_logvar,
    mu,
    q_loggamma,
    q_logeta,
    logpi,
    logpia,
    mode="bernoulli",
    eps=1e-5,
):
    """N samples of dimension D mapped to a latent space of dimension K with F
    factors of C clusters in K dimensions.

    Parameters
    ----------
    x: array (observation)
        should be of shape (N, D)

    x_rec: array (reconstruction, output of decoder)
        should be of shape (N, D)

    x_logvar: array (parameter)
        should be of shape (D,)

    z_logvar: array (parameter)
        should be of shape (K,)

    q_mu: array
        should be of shape (N, K), inferred mean of the variational Gaussian

    q_logvar: array
        should be of shape (N, K), inferred log-variance of the variational
        Gaussian

    mu: array (parameter)
        should be of shape (F, C, K), parameter (centroids)

    q_loggamma: array
        should be of shape (N, F, C), inferred log-probabilities of the
        cluster assignments

    logpi: array (parameter)
        should be of shape (F, C), parameter (log-prior of clusters)

    logpia: array (parameter)
        should be of shape (F,)

    :param q_logeta:
    :param mode:
    :param eps:
    """
    q_var = T.exp(q_logvar)
    q_gamma = T.exp(q_loggamma)
    q_eta = T.exp(q_logeta)
    z_var = T.exp(z_logvar)
    x_var = T.exp(x_logvar)
    pi = T.exp(logpi)
    pia = T.exp(logpia)

    K = z_var.shape[0]
    D = x_var.shape[0]
    F = logpi.shape[0]
    log2pi = T.log(2 * np.pi)

    # reconstruction part (first expectation)
    E1 = -0.5 * (((x - x_rec) ** 2 / x_var).sum(1) + x_logvar.sum() + D * log2pi)

    E2_1 = -0.5 * (log2pi + z_logvar + (q_var + q_mu ** 2) / z_var).sum(1)
    E2_2 = T.einsum("nf,nfc,fck,nk->n", q_eta, q_gamma, mu, q_mu / z_var)
    E2_3 = -0.5 * (T.einsum("nf,nfc,fck->nk", q_eta, q_gamma, mu ** 2) / z_var).sum(1)

    if mode == "bernoulli":
        q_gammaeta = T.einsum("nf,nfc->nfc", q_eta, q_gamma)
        corr = T.einsum(
            "fcd,nfc,abk,nab->nfa", mu / z_var, q_gammaeta, mu, q_gammaeta
        )
        E2_4 = -0.5 * T.sum(corr * (1 - T.eye(F)), (1, 2))
    else:
        E2_4 = 0.0

    E3 = (q_gamma * logpi).sum((1, 2))

    if mode == "bernoulli":
        E4 = (q_eta * logpia + (1 - q_eta) * T.log(1 - pia + eps)).sum(1)
    else:
        E4 = (q_eta * logpia).sum(1)

    # now on to the entropy
    H = (
        K * (log2pi + 1) / 2
        + 0.5 * q_logvar.sum(1)
        - (q_gamma * q_loggamma).sum((1, 2))
    )

    if mode == "bernoulli":
        Ha = -(q_eta * q_logeta + (1 - q_eta) * T.log(1 - q_eta + eps)).sum(1)
    else:
        Ha = -(q_eta * q_logeta).sum(1)

    return -(E1 + E2_1 + E2_2 + E2_3 + E2_4 + E3 + E4 + H + Ha)


def sparse_softmax_crossentropy_logits(p, q):
    r"""Cross-entropy loss where :math:`p` is sparse (class indices) and
    :math:`q` are the unnormalized log-probabilities (logits).

    The formal definition, given that :math:`p` is now an index (of the Dirac)
    s.t. :math:`p \in \{1, \dots, D\}` and :math:`q` is unnormalized
    (log-probabilities), is (for discrete variables, :math:`p` sparse)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + \log\Big(\sum_{d=1}^D \exp(q_d)\Big)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + LogSumExp(q)

    .. math::
        \mathcal{L}(p,q) = -q_{p} + LogSumExp(q - \max_{d} q_d)

    or (non-sparse :math:`p`)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d}
                           + \log\Big(\sum_{d=1}^D \exp(q_d)\Big)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d} + LogSumExp(q)

    .. math::
        \mathcal{L}(p,q) = -\sum_{d=1}^D p_{d} q_{d}
                           + LogSumExp(q - \max_{d} q_d)

    with :math:`p` the class index and :math:`q` the predicted logits (output
    of the network). This function takes a vector of class indices ``p`` and a
    matrix of unnormalized log-probabilities ``q``.
    """
    # the linear part of the loss
    # one = T.equal(p[:, None], T.arange(q.shape[1])).astype("float32")
    # return -(one * log_softmax(q)).sum(1)
    return -T.take_along_axis(log_softmax(q), p[:, None], 1).squeeze(1)

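
# Usage sketch: a NumPy version of the sparse cross-entropy above,
# -q_p + LogSumExp(q), with the usual max-shift for numerical stability;
# `_sparse_softmax_ce_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _sparse_softmax_ce_numpy_sketch(p, q):
    q = np.asarray(q, dtype=float)   # (N, D) logits
    p = np.asarray(p, dtype=int)     # (N,) class indices
    shifted = q - q.max(axis=1, keepdims=True)
    logsumexp = np.log(np.exp(shifted).sum(axis=1)) + q.max(axis=1)
    return logsumexp - q[np.arange(q.shape[0]), p]
# e.g. _sparse_softmax_ce_numpy_sketch([0], [[2.0, 0.0]]) -> ~0.127 (= log(1 + e^-2))
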

def softmax_crossentropy_logits(p, q):
    """see sparse cross-entropy"""
    return -(p * log_softmax(q)).sum(-1)


def sparse_sigmoid_crossentropy_logits(labels, logits):
    return -logits * labels + softplus(logits)

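
# Usage sketch: the binary cross-entropy with logits written in the same
# numerically stable form as above, -labels * logits + softplus(logits),
# checked with NumPy; `_sigmoid_ce_numpy_sketch` is a hypothetical helper added
# for illustration only, it is not part of symjax.
def _sigmoid_ce_numpy_sketch(labels, logits):
    labels = np.asarray(labels, dtype=float)
    logits = np.asarray(logits, dtype=float)
    # stable softplus: log(1 + e^x) = log1p(e^{-|x|}) + max(x, 0)
    softplus_np = np.log1p(np.exp(-np.abs(logits))) + np.maximum(logits, 0.0)
    return -logits * labels + softplus_np
# e.g. _sigmoid_ce_numpy_sketch([1.0], [0.0]) -> [log(2)] ~= [0.693]
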

def hinge_loss(predictions, targets, delta=1):
    r"""(binary) hinge loss.

    For an intended output :math:`t = \pm 1` and a classifier score :math:`p`,
    the hinge loss of each datum is defined as

    .. math::
        \max(0, \Delta - t p)

    As soon as the loss is smaller than :math:`\Delta` the datum is correctly
    classified; the margin is further increased by pushing the loss toward
    :math:`0`, hence :math:`\Delta` is the user-defined preferred margin to
    reach. In standard SVMs :math:`\Delta = 1`, leading to

    .. plot::

        import matplotlib.pyplot as plt
        import numpy as np
        x = np.linspace(-3, 3, 300)
        y = 1
        hinge1 = np.maximum(0, 1 - x * y)
        hinge02 = np.maximum(0, 0.2 - x * y)
        plt.plot(x, hinge1)
        plt.plot(x, hinge02)
        plt.plot(x, (x < 0).astype('int32'), '--k')
        plt.axvline(1, c='r')
        plt.xlabel(r'predictions')
        plt.ylabel(r'hinge$(p,1,\Delta)$')
        plt.legend([r'hinge loss $\Delta=1$', r'hinge loss $\Delta=0.2$',
                    '0-1 loss', 'true label'])
        plt.tight_layout()
        plt.show()

    Note that :math:`p` should be the "raw" output of the classifier's decision
    function, not the predicted class label. For instance, in linear SVMs,
    :math:`p = \langle w, x \rangle + b` where :math:`w, b` are the parameters
    of the hyperplane and :math:`x` is the input variable(s).

    Parameters
    ----------
    predictions : 1D tensor
        raw (decision function) predictions of the classifier

    targets : 1D tensor
        binary targets with values in :math:`t \in \{-1, 1\}`

    delta : scalar, default 1
        the hinge loss margin

    Returns
    -------
    1D tensor
        An expression for the item-wise hinge loss

    Notes
    -----
    This is an alternative to the categorical cross-entropy loss for
    classification problems
    """
    return T.maximum(0, delta - predictions * targets)

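
# Usage sketch: the binary hinge loss max(0, delta - t*p) evaluated with NumPy;
# `_hinge_numpy_sketch` is a hypothetical helper added for illustration only,
# it is not part of symjax.
def _hinge_numpy_sketch(predictions, targets, delta=1.0):
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)   # entries in {-1, +1}
    return np.maximum(0.0, delta - predictions * targets)
# e.g. _hinge_numpy_sketch([2.0, 0.3, -1.0], [1, 1, 1]) -> [0.0, 0.7, 2.0]
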

def multiclass_hinge_loss(predictions, targets, delta=1):
    r"""multi-class hinge loss.

    .. math::
        L_i = \max_{j \neq t_i} \left(0, p_j - p_{t_i} + \Delta\right)

    Parameters
    ----------
    predictions : 2D tensor
        Predictions in (0, 1), such as softmax output of a neural network,
        with data points in rows and class probabilities in columns.

    targets : 2D tensor or 1D tensor
        Either a vector of int giving the correct class index per data point
        or a 2D tensor of one-hot encoding of the correct class in the same
        layout as predictions (non-binary targets in [0, 1] do not work!)

    delta : scalar, default 1
        The hinge loss margin

    Returns
    -------
    1D tensor
        An expression for the item-wise multi-class hinge loss

    Notes
    -----
    This is an alternative to the categorical cross-entropy loss for
    multi-class classification problems
    """
    num_cls = predictions.shape[1]
    if targets.ndim == predictions.ndim - 1:
        targets = T.to_one_hot(targets, num_cls)
    elif targets.ndim != predictions.ndim:
        raise TypeError("rank mismatch between targets and predictions")
    corrects = predictions[targets.nonzero()]
    rest = predictions[(1 - targets).nonzero()].reshape((-1, num_cls - 1))
    rest = rest.max(axis=1)
    return T.activations.relu(rest - corrects + delta)

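
# Usage sketch: a NumPy version of the multi-class hinge loss
# max(0, max_{j != t_i} p_j - p_{t_i} + delta) for integer targets;
# `_multiclass_hinge_numpy_sketch` is a hypothetical helper added for
# illustration only, it is not part of symjax.
def _multiclass_hinge_numpy_sketch(predictions, targets, delta=1.0):
    predictions = np.asarray(predictions, dtype=float)   # (N, C) scores
    targets = np.asarray(targets, dtype=int)             # (N,) class indices
    n = predictions.shape[0]
    corrects = predictions[np.arange(n), targets]        # score of the true class
    rest = predictions.copy()
    rest[np.arange(n), targets] = -np.inf                # exclude the true class
    return np.maximum(0.0, rest.max(axis=1) - corrects + delta)
# e.g. _multiclass_hinge_numpy_sketch([[2.0, 0.5, 0.1]], [0]) -> [0.0]
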

def squared_differences(x, y):
    """elementwise squared differences.

    Computes element-wise

    .. math::
        (x - y)^2

    broadcasting applies as in any other operation.

    `Wikipedia <https://en.wikipedia.org/wiki/Loss_function#Quadratic_loss_function>`_

    Parameters
    ----------
    x: tensor-like

    y: tensor-like

    Returns
    -------
    tensor-like
    """
    return (x - y) ** 2


def accuracy(targets, predictions):
    r"""classification accuracy.

    It is computed by averaging the
    `0-1 loss <https://en.wikipedia.org/wiki/Loss_function#0-1_loss_function>`_
    as in

    .. math::
        \frac{1}{N}\sum_{n=1}^N 1_{\{y_n = p_n\}}

    where :math:`p` denotes the predictions. The inputs must be vectors, but in
    the special case where ``targets`` is a vector and ``predictions`` is a
    matrix, the argmax is used to obtain the actual predictions, as in

    .. math::
        \frac{1}{N}\sum_{n=1}^N 1_{\{y_n = \arg\max_c p_{n,c}\}}

    `Wikipedia <https://en.wikipedia.org/wiki/Accuracy_and_precision>`_

    Parameters
    ----------
    targets: 1D tensor-like

    predictions: tensor-like
        it can be a :math:`2D` matrix in which case the ``argmax`` is used to
        get the prediction

    Returns
    -------
    tensor-like
    """
    if not hasattr(targets, "ndim"):
        targets = T.array(targets)
    if targets.ndim != 1:
        raise RuntimeError(
            f"targets should be of rank 1, given rank is {targets.ndim}"
        )
    if predictions.ndim == 2:
        accu = T.cast(T.equal(targets, predictions.argmax(1)), "float32")
    elif predictions.ndim == 1:
        accu = T.cast(T.equal(targets, predictions), "float32")
    else:
        raise RuntimeError(
            f"predictions should be of rank 1 or 2, given rank is {predictions.ndim}"
        )
    return accu.mean()

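
# Usage sketch: the 0-1 accuracy computed with NumPy, taking the argmax when
# `predictions` is a matrix, mirroring the function above;
# `_accuracy_numpy_sketch` is a hypothetical helper added for illustration
# only, it is not part of symjax.
def _accuracy_numpy_sketch(targets, predictions):
    targets = np.asarray(targets)
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        predictions = predictions.argmax(1)
    return (targets == predictions).mean()
# e.g. _accuracy_numpy_sketch([0, 1, 1], [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]]) -> 2/3
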

def _assign(cluster, pred, true):
    # label of one predicted cluster: the most frequent true label among the
    # points assigned to that cluster
    c_labels = T.where(T.equal(pred, cluster)[:, None], true, 0)
    per_cluster_counts = c_labels.sum(0)
    assignment = per_cluster_counts.argmax()
    return assignment


def clustering_accuracy(labels, predictions, n_clusters):
    """clustering accuracy based on intra-cluster majority labels.

    This accuracy quantifies the ability of a clustering algorithm to solve the
    clustering task given the true labels of the data. For each predicted
    cluster, this function finds the most frequent true label inside that
    cluster and uses it as the cluster label. The accuracy is then computed
    from those cluster labels.

    Args:
        labels: 1d integer Tensor
            the true labels of the data
        predictions: 1d integer Tensor
            the predicted data clusters
        n_clusters: int
            the number of clusters
    """
    one_hot_labels = T.one_hot(labels, n_clusters)
    cluster_assignment = T.map(
        _assign,
        sequences=[T.range(n_clusters, dtype="int32")],
        non_sequences=[predictions, one_hot_labels],
    )
    return accuracy(labels, cluster_assignment[predictions])

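
# Usage sketch: the same majority-vote relabeling done with NumPy, to make the
# cluster-to-label assignment explicit; `_clustering_accuracy_numpy_sketch` is
# a hypothetical helper added for illustration only, it is not part of symjax.
def _clustering_accuracy_numpy_sketch(labels, predictions, n_clusters):
    labels = np.asarray(labels, dtype=int)
    predictions = np.asarray(predictions, dtype=int)
    # for each predicted cluster, pick the most frequent true label in it
    assignment = np.zeros(n_clusters, dtype=int)
    for c in range(n_clusters):
        counts = np.bincount(labels[predictions == c], minlength=labels.max() + 1)
        assignment[c] = counts.argmax()
    return (assignment[predictions] == labels).mean()
# e.g. with labels [0, 0, 1, 1] and predictions [1, 1, 0, 0] the clusters are
# relabeled by majority vote and the accuracy is 1.0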