Implementing self-attention¶

Let's implement a Transformer. Don't worry, we can do this.

Setup¶

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

def num_parameters(model):
    """Count the number of trainable parameters in a model"""
    return sum(param.numel() for param in model.parameters() if param.requires_grad)

Getting started¶

Let's start with an input sequence.

In [28]:
sentence = "Transformed."

Let's turn it into numbers by using the Unicode code point for each character. To keep things simple, we won't batch anything.

In [29]:
sentence_tensor = torch.tensor([ord(x) for x in sentence])
sentence_tensor
Out[29]:
tensor([ 84, 114,  97, 110, 115, 102, 111, 114, 109, 101, 100,  46])
In [30]:
def decode(x):
    return ''.join(chr(c) for c in x.tolist())
decode(sentence_tensor)
Out[30]:
'Transformed.'

We'll make this an autoregressive language model, so our goal will be to predict the next character. That means we need to shift the targets left by one position: each input character's target is the character that follows it.

In [31]:
targets = sentence_tensor[1:]
input_ids = sentence_tensor[:-1]
assert input_ids.shape == targets.shape
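
To see the shift concretely:

decode(input_ids), decode(targets)  # ('Transformed', 'ransformed.')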

Now let's make those numbers into vectors using an embedding. Note that we're just going to use the random initialization right now; we're not yet training this model.

Embeddings¶

A big chunk of a model's parameters comes from the word embeddings. Let's see an example.

We'll use a vocabulary size of 256 (very small, but enough to store individual bytes) and an embedding dimensionality of 5 (also very small).

(Note: we'll still call these word embeddings even though we'll actually use them as embeddings for individual characters.)

In [32]:
n_vocab = 256
emb_dim = 5
embedder = nn.Embedding(n_vocab, emb_dim)
embedder.weight.shape
Out[32]:
torch.Size([256, 5])

How many parameters are needed for the word embeddings?

In [33]:
num_parameters(embedder)
Out[33]:
1280
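
That's just the size of the embedding table:

assert num_parameters(embedder) == n_vocab * emb_dim  # 256 * 5 = 1280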

Complete but vacuous model¶

We'll now define a model that has the outward structure of a language model, but without any internal processing.

In [34]:
class BareBonesLM(nn.Module):
    def __init__(self, n_vocab, emb_dim, tie_weights=True, bias=False):
        super().__init__()
        self.word_to_embedding = nn.Embedding(n_vocab, emb_dim)
        self.lm_head = nn.Linear(emb_dim, n_vocab, bias=bias)

        if tie_weights:
            assert self.lm_head.weight.shape == self.word_to_embedding.weight.shape
            self.lm_head.weight = self.word_to_embedding.weight
    
    def forward(self, input_ids):
        input_embeds = self.word_to_embedding(input_ids)
        x = input_embeds # model would go here
        logits = self.lm_head(x)
        return logits


num_parameters(BareBonesLM(n_vocab=n_vocab, emb_dim=emb_dim))
Out[34]:
1280
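
With tied weights, the lm_head reuses the embedding table, so the parameter count matches the embedder alone. As a quick check that the shapes work out (just running the untrained model):

bare_lm = BareBonesLM(n_vocab=n_vocab, emb_dim=emb_dim)
bare_lm(input_ids).shape  # torch.Size([11, 256]): one row of logits per input token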

Multi-Layer Perceptron¶

Now let's start putting a model in there. Let's start with the simple MLP that we looked at in previous Fundamentals. We'll define it as a PyTorch Module:

In [35]:
class MLP(nn.Module):
    def __init__(self, emb_dim, n_hidden):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(in_features=emb_dim, out_features=n_hidden),
            nn.ReLU(), # or nn.GELU() or others
            nn.Linear(in_features=n_hidden, out_features=emb_dim)
        )

    def forward(self, x):
        return self.model(x)

num_parameters(MLP(emb_dim=emb_dim, n_hidden=16))
Out[35]:
181
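
That count is just the two linear layers, weights plus biases:

n_hidden = 16
assert num_parameters(MLP(emb_dim=emb_dim, n_hidden=n_hidden)) == (emb_dim * n_hidden + n_hidden) + (n_hidden * emb_dim + emb_dim)  # 96 + 85 = 181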

Complete Language Model with MLP¶

Now let's put that into an LM. Notice what's similar and what's different between this and the BareBonesLM above.

In [36]:
class FeedForwardLM(nn.Module):
    def __init__(self, n_vocab, emb_dim, n_hidden, tie_weights=True):
        super().__init__()
        self.word_to_embedding = nn.Embedding(n_vocab, emb_dim)
        self.model = MLP(emb_dim=emb_dim, n_hidden=n_hidden)
        self.lm_head = nn.Linear(emb_dim, n_vocab, bias=False)

        if tie_weights:
            assert self.lm_head.weight.shape == self.word_to_embedding.weight.shape
            self.lm_head.weight = self.word_to_embedding.weight
    
    def forward(self, input_ids):
        input_embeds = self.word_to_embedding(input_ids)
        x = self.model(input_embeds)
        return self.lm_head(x)


ff_lm = FeedForwardLM(n_vocab=n_vocab, emb_dim=emb_dim, n_hidden=16)
num_parameters(ff_lm)
Out[36]:
1461
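
That count is just the tied embedding table plus the MLP we counted above:

assert num_parameters(ff_lm) == 1280 + 181  # embeddings + MLP = 1461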

Self-Attention¶

We're going to implement an oversimplified Self-Attention layer. If you're interested, here are a few reference implementations of the real thing:

  • https://github.com/jaymody/picoGPT/
  • Annotated Transformer
  • PyTorch's builtin implementation: TransformerEncoderLayer and MultiheadAttention.
  • minGPT

First we'll do it step-by-step, then we'll bundle it into a module.

To show what's going on, let's get some example input embeddings.

In [37]:
example_embeddings = embedder(input_ids)
example_embeddings.shape
Out[37]:
torch.Size([11, 5])

Stop: Can you explain that shape?

We're going to implement a single attention head. Recall that we need to compute three things for each token: a query (q), which gets compared with every token's key (k); the better a key matches, the more of that token's value (v) gets added into the output. Let's do that. First, let's define linear layers to compute q, k, and v.

In [38]:
# In the real model, this will be defined in the __init__ method
# Note: some models include biases here, others don't.
get_query = nn.Linear(emb_dim, emb_dim, bias=False)
get_key = nn.Linear(emb_dim, emb_dim, bias=False)
get_value = nn.Linear(emb_dim, emb_dim, bias=False)

Then, when we do a forward pass, we compute q, k, and v for each token. We do that by passing the input embeddings through the linear layers.

In [39]:
queries = get_query(example_embeddings)
keys = get_key(example_embeddings)
values = get_value(example_embeddings)

Stop: What shape will queries have? What about keys and values? Check your answers by printing them out.

Now, let's compute how much each token's query matches each token's key. We do that by computing the dot product of each query with each key. We could do something like:

In [40]:
query_2_key_1 = queries[2] @ keys[1]

But that would be tedious and require looping. Instead, we can just compute a matrix multiplication.

In [41]:
query_key_products = queries @ keys.T
query_key_products.shape
Out[41]:
torch.Size([11, 11])
In [42]:
# Check that it's computing the same thing.
assert torch.allclose(query_2_key_1, query_key_products[2, 1])

The result is a matrix of dot products. Let's look at it. (Note: since the embeddings and the linear layers are random, your values will be different.)

In [43]:
# center the color scale on 0.
plt.imshow(query_key_products.detach().numpy(), cmap="RdBu", vmin=-2, vmax=2)
plt.title("Query-key dot products")
plt.xlabel("Key token index")
plt.ylabel("Query token index")
plt.colorbar();

We then use the softmax function to compute how much to actually pay attention to each token. It turns out that to stabilize the computation, it helps to divide the dot products by the square root of the key dimensionality (here, emb_dim) before the softmax, i.e., to use a softmax temperature of sqrt(d_k).

In [44]:
attention_weights = F.softmax(query_key_products / emb_dim**0.5, dim=-1)
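
Since the softmax normalizes over the key dimension, each row of the attention weights should sum to 1. A quick check:

assert torch.allclose(attention_weights.sum(dim=-1), torch.ones(attention_weights.shape[0]))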

What do the attention weights look like?

In [45]:
plt.imshow(attention_weights.detach().numpy(), cmap="cividis", vmin=0)
plt.title("Attention weights")
plt.xlabel("Key token index")
plt.ylabel("Query token index")
plt.colorbar();

Those weights determine, for each token, how much of each other token's value to add up. We can compute that by multiplying the weights by the values.

In [46]:
first_token_output = attention_weights[0] @ values
first_token_output.shape # notice that this is the same shape as the input embeddings
Out[46]:
torch.Size([5])

Again, we can do the computation all at once with a matrix multiplication.

In [47]:
head_output = attention_weights @ values
assert torch.allclose(first_token_output, head_output[0])

The full attention mechanism has this go through another linear layer, but we'll skip that for now.

But wait! Tokens shouldn't be able to attend to tokens that come after them. We need to mask out the attention weights for those tokens.

A simple way to do that is to set the dot products to a very small value for tokens that come after the current token. That way, when we apply the softmax, the attention weights will be very close to zero.

In [48]:
sequence_len = input_ids.shape[0]
causal_mask = (1 - torch.tril(torch.ones(sequence_len, sequence_len))) * -1e9
plt.imshow(causal_mask.detach().numpy(), cmap="cividis", vmin=-1e9)
Out[48]:
<matplotlib.image.AxesImage at 0x11f2804f0>
In [49]:
attention_weights = F.softmax((query_key_products + causal_mask) / emb_dim**0.5, dim=-1)
plt.imshow(attention_weights.detach().numpy(), cmap="cividis", vmin=0)
plt.title("Attention weights")
plt.xlabel("Key token index")
plt.ylabel("Query token index")
plt.colorbar();
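
A quick check that the mask did its job: every weight above the diagonal should be (numerically) zero:

assert torch.allclose(torch.triu(attention_weights, diagonal=1), torch.zeros_like(attention_weights))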

Now we can bundle that into a Module. Your task: add the causal masking.

In [50]:
class BareBonesAttentionHead(nn.Module):
    '''Implements *single-head* attention, no masking, no dropout, no init'''
    def __init__(self, emb_dim, head_dim):
        super().__init__()
        self.head_dim = head_dim
        self.get_query = nn.Linear(emb_dim, head_dim, bias=False)
        self.get_key = nn.Linear(emb_dim, head_dim, bias=False)
        self.get_value = nn.Linear(emb_dim, head_dim, bias=False)

    def forward(self, x):
        seq_len, emb_dim = x.shape

        # Compute query, key, and value vectors.
        q = self.get_query(x) # (seq_len, head_dim)
        k = self.get_key(x)
        v = self.get_value(x)

        # Compute attention weights
        k_transpose = k.transpose(-2, -1)
        assert k_transpose.shape == (self.head_dim, seq_len)
        scores = q @ k_transpose / self.head_dim ** 0.5  # scale by sqrt(head_dim), as above
        assert scores.shape == (seq_len, seq_len)
        attention_weights = scores.softmax(dim=-1)

        # Compute weighted sum of values.
        out = attention_weights @ v
        return out

class BareBonesSelfAttention(nn.Module):
    def __init__(self, emb_dim, head_dim, n_heads):
        super().__init__()
        self.heads = nn.ModuleList([
            BareBonesAttentionHead(emb_dim, head_dim)
            for i in range(n_heads)
        ]) # we need a ModuleList to make sure the heads are counted as children.
        self.to_output = nn.Linear(n_heads * head_dim, emb_dim, bias=False)

    def forward(self, x):
        head_outputs = [head(x) for head in self.heads]
        concats = torch.cat(head_outputs, dim=-1)
        out = self.to_output(concats)
        assert out.shape == x.shape
        return out

input_embeds = embedder(sentence_tensor)
self_attn = BareBonesSelfAttention(emb_dim, head_dim=256, n_heads=2)
self_attn(input_embeds).shape
Out[50]:
torch.Size([12, 5])
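
As a sanity check on the parameter count: each head has three bias-free projections from emb_dim to head_dim, and the output projection maps the concatenated heads back to emb_dim:

head_dim, n_heads = 256, 2
assert num_parameters(self_attn) == n_heads * 3 * emb_dim * head_dim + n_heads * head_dim * emb_dim  # 7680 + 2560 = 10240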

Now we'll define a Transformer. Well, we'll just define a single layer. A true Transformer will have many (32, 64, 118, ...) such layers. But it's just running a bunch of copies of this layer in sequence.

One new thing here is the layer norm. It basically rescales the activations to be mean 0 variance 1, then scales and shifts by learnable constants. So each one adds 2 * emb_dim parameters to the model, and there's one after each part (self-attention and MLP).
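
To make that concrete, here's a minimal sketch of what nn.LayerNorm computes (just an illustration; the layer below uses PyTorch's builtin):

layer_norm = nn.LayerNorm(emb_dim)
example = torch.randn(3, emb_dim)
# Normalize over the last dimension using the biased variance, then scale and shift.
manual = (example - example.mean(dim=-1, keepdim=True)) / torch.sqrt(
    example.var(dim=-1, unbiased=False, keepdim=True) + layer_norm.eps)
manual = manual * layer_norm.weight + layer_norm.bias
assert torch.allclose(layer_norm(example), manual, atol=1e-5)
num_parameters(layer_norm)  # 2 * emb_dim = 10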

In [51]:
class BareBonesTransformerLayer(nn.Module):
    '''Implements bare-bones self-attention transformer layer, no residual connections, no dropout'''
    def __init__(self, emb_dim, head_dim, n_heads, dim_feedforward):
        super().__init__()
        self.self_attention = BareBonesSelfAttention(emb_dim, head_dim=head_dim, n_heads=n_heads)
        self.mlp = MLP(emb_dim, n_hidden=dim_feedforward)
        self.norm_after_attn = nn.LayerNorm(emb_dim)
        self.norm_after_mlp = nn.LayerNorm(emb_dim)
    
    def forward(self, x):
        x = self.self_attention(x)
        x = self.norm_after_attn(x)
        x = self.mlp(x)
        x = self.norm_after_mlp(x)
        return x

xformer_layer = BareBonesTransformerLayer(emb_dim, dim_feedforward=emb_dim, head_dim=256, n_heads=2)
xformer_layer(input_embeds).shape
Out[51]:
torch.Size([12, 5])
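
A full model just runs many copies of this layer one after another. Here's a minimal sketch of that stacking (n_layers is a made-up value here; we're not training any of this):

n_layers = 4  # hypothetical; real models stack dozens of layers
layer_stack = nn.Sequential(*[
    BareBonesTransformerLayer(emb_dim, dim_feedforward=emb_dim, head_dim=256, n_heads=2)
    for _ in range(n_layers)
])
layer_stack(input_embeds).shape  # still torch.Size([12, 5])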

Now, at long last, let's make a complete Transformer-based language model.

A big new aspect here is the position embeddings. We'll implement learned absolute position embeddings, which do add parameters to the model. A cool new approach, called Rotary Position Embeddings (RoPE), gets comparable or better performance without adding parameters; for an implementation of that, see x-transformers.

In [52]:
class TransformerLM(nn.Module):
    def __init__(self, n_vocab, max_len, emb_dim, n_hidden, head_dim=256, n_heads=2):
        super().__init__()
        self.word_to_embedding = nn.Embedding(n_vocab, emb_dim)
        self.pos_to_embedding = nn.Embedding(max_len, emb_dim)
        self.model = BareBonesTransformerLayer(
            emb_dim=emb_dim, dim_feedforward=n_hidden,
            head_dim=head_dim, n_heads=n_heads)
        self.lm_head = nn.Linear(emb_dim, n_vocab, bias=False)

        assert self.lm_head.weight.shape == self.word_to_embedding.weight.shape
        self.lm_head.weight = self.word_to_embedding.weight
    
    def forward(self, input_ids):
        input_embeds = self.word_to_embedding(input_ids)
        # Compute position embeddings.
        position_ids = torch.arange(input_ids.shape[-1])
        pos_embeds = self.pos_to_embedding(position_ids)
        x = input_embeds + pos_embeds
        x = self.model(x)
        return self.lm_head(x)


xformer_lm = TransformerLM(n_vocab=n_vocab, max_len=50, emb_dim=emb_dim, n_hidden=16)
num_parameters(xformer_lm)
Out[52]:
11971
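
That total is just the pieces we've already counted, plus the position embeddings:

# (tied) word embeddings: 256 * 5 = 1280; position embeddings: 50 * 5 = 250
# transformer layer: self-attention 10240 + MLP 181 + two layer norms 2 * 2 * emb_dim = 20
assert num_parameters(xformer_lm) == 1280 + 250 + 10240 + 181 + 20  # 11971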