import torch
import torch.nn.functional as F
from torch import tensor
import ipywidgets as widgets
import matplotlib.pyplot as plt
%matplotlib inline
Make a fake batch of two predictions for a 5-class problem.
torch.manual_seed(0)
logits = torch.randn((2, 5))
logits
tensor([[ 1.5410, -0.2934, -2.1788,  0.5684, -1.0845],
        [-1.3986,  0.4033,  0.8380, -0.7193, -0.4033]])
targets = torch.randint(0, 5, size=(2,))
targets
tensor([1, 4])
Here's what PyTorch cross-entropy gives us (reduction='none' keeps the per-example losses instead of averaging them):
F.cross_entropy(logits, targets, reduction='none')
tensor([2.3257, 2.0541])
We can get the same thing by hand by normalizing the logits into log-probabilities (prove to yourself that taking the softmax of the logits and then the log of the result is the same as subtracting the log of the sum of the exponentiated logits from each logit).
logprobs = logits - logits.logsumexp(axis=1, keepdim=True)
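A quick sanity check of that claim, using F.log_softmax (PyTorch's fused softmax-then-log); this allclose should come out True:
torch.allclose(F.log_softmax(logits, dim=1), logprobs)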
Here's what logsumexp does:
logits.logsumexp(axis=1, keepdim=True)
tensor([[2.0323],
        [1.6507]])
logits.exp().sum(axis=1, keepdim=True).log()
tensor([[2.0323],
        [1.6507]])
# numerically stable version: subtract the max logit before exponentiating; see e.g., https://gregorygundersen.com/blog/2020/02/09/log-sum-exp/
max_logit = logits.max(axis=1, keepdim=True).values
max_logit + (logits - max_logit).exp().sum(axis=1, keepdim=True).log()
tensor([[2.0323],
        [1.6507]])
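To see why the stability trick matters, here's a quick sketch with some made-up large logits: the naive exp-then-sum-then-log overflows float32 to inf, while logsumexp (which subtracts the max internally) stays finite.
big = tensor([[1000., 1001., 1002.]])
big.exp().sum(axis=1, keepdim=True).log()  # exp(1000.) overflows float32, so this comes out inf
big.logsumexp(axis=1, keepdim=True)        # stable: roughly 1002.4076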
Now all we need to do is pick out the log-prob of the correct answer for each example (the loss is just its negative). There are two ways of doing this. One is to make one-hot vectors:
targets_1hot = F.one_hot(targets).float()
targets_1hot
tensor([[0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])
(logprobs * targets_1hot).sum(axis=1)
tensor([-2.3257, -2.0541])
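Those are exactly the losses PyTorch reported above, just with the sign flipped; as a quick check, this should come out True:
torch.allclose((logprobs * targets_1hot).sum(axis=1), -F.cross_entropy(logits, targets, reduction='none'))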
And the other is to "gather": for each row, pick out the entry at that row's target index. I had to look this up! First, spelled out with a plain Python loop:
[logprobs[entry, target] for entry, target in enumerate(targets)]
[tensor(-2.3257), tensor(-2.0541)]
And here's the same thing vectorized with gather:
logprobs.gather(1, targets.unsqueeze(1))
tensor([[-2.3257],
        [-2.0541]])
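Putting the pieces together, here's a from-scratch version of the per-example loss (a minimal sketch; my_cross_entropy is just a name I've made up). The final allclose should come out True:
def my_cross_entropy(logits, targets):
    # normalize logits into log-probs, pick out each row's target entry, negate
    logprobs = logits - logits.logsumexp(axis=1, keepdim=True)
    return -logprobs.gather(1, targets.unsqueeze(1)).squeeze(1)
torch.allclose(my_cross_entropy(logits, targets), F.cross_entropy(logits, targets, reduction='none'))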