Self-Attention By Hand (in Code)¶

You computed self-attention by hand on the handout for "the cat chases the". Now let's verify those calculations in code, then experiment with different query vectors.

Objectives: TM-SelfAttention, TM-TransformerDataFlow

In [ ]:
import torch

Setup: Vectors from the Handout¶

These are the key and value vectors from the handout, plus the query vector for "the" (position 3).

In [ ]:
tokens = ["the", "cat", "chases", "the"]

# Key vectors for each token (from the handout)
keys = torch.tensor([
    [1., 0.],   # "the" (pos 0)
    [3., 1.],   # "cat" (pos 1)
    [0., 3.],   # "chases" (pos 2)
    [2., 0.],   # "the" (pos 3)
])

# Value vectors for each token (from the handout)
values = torch.tensor([
    [0., 0.],   # "the" (pos 0)
    [4., 1.],   # "cat" (pos 1)
    [2., 2.],   # "chases" (pos 2)
    [0., 0.],   # "the" (pos 3)
])

# Query vector for "the" at position 3
query = torch.tensor([2., 3.])

Step 1: Compute Attention Scores¶

The attention score between the query and each key is their dot product. Fill in the dot product computation inside the loop below.

Reminder: torch.dot(a, b) computes the dot product of two 1D tensors.

In [ ]:
# Compute the attention score (dot product) for each token.
scores = []
for i, token in enumerate(tokens):
    # score = dot product of query with keys[i]
    pass  # your code here (replace `pass` with your computation)

Check: do these match your hand calculations from the handout?

Step 2: Normalize to Get Attention Weights¶

On the handout, we normalized by dividing by the sum. Do that here. (Real transformers use softmax, which we'll try next.)

In [ ]:
# Normalize: divide each score by the sum of all scores
# your code here

Step 3: Compute the Output¶

The output is the weighted sum of the value vectors. Each value vector is multiplied by its attention weight, then they're all added together.

In [ ]:
# Compute weighted sum of values
# your code here

Check: does this match your handout answer (bottom-right cell of the table)?

Step 4: Try Softmax Instead¶

Real transformers use softmax (not simple sum-normalization) to convert scores to weights. Try it and see how the weights change.

In [ ]:
# your code here

How did softmax change the weights compared to simple normalization? Which tokens got more/less attention?

your answer here

Experiment: Change the Query¶

The handout's discussion questions ask: "How would the attention pattern change if the query was [1, 3] instead?"

Try it! Also try other query vectors and see what happens.

In [ ]:
def compute_attention(query, keys, values, tokens, use_softmax=True):
    """Compute and display attention for a given query."""
    scores = keys @ query
    if use_softmax:
        weights = torch.softmax(scores, dim=0)
    else:
        weights = scores / scores.sum()
    output = weights @ values
    
    print(f"Query: {query.tolist()}")
    for token, s, w in zip(tokens, scores, weights):
        bar = '#' * int(w.item() * 40)
        print(f"  {token:10s}  score={s.item():5.1f}  weight={w.item():.3f}  {bar}")
    print(f"  Output: {output.tolist()}")
    print()
In [ ]:
# Original query from the handout
compute_attention(torch.tensor([2., 3.]), keys, values, tokens)

# Discussion question: what if query was [1, 3]?
compute_attention(torch.tensor([1., 3.]), keys, values, tokens)

# Try your own query vectors!
# your code here

Challenge: Design a Query¶

Can you find a query vector that makes "the" attend almost entirely to "cat" (weight > 0.9)? What about one that splits attention roughly equally between "cat" and "chases"?

Hint: look at the key vectors. What query would have a large dot product with k("cat") = [3, 1] but small dot products with everything else?

In [ ]:
# your code here: find a query that gives > 0.9 weight to "cat"
In [ ]:
# your code here: find a query that splits attention ~equally between "cat" and "chases"
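One way to start exploring (not the only answer): a query pointing in the same direction as k("cat") = [3, 1], scaled up, makes softmax concentrate sharply on "cat".

```python
import torch

keys = torch.tensor([[1., 0.], [3., 1.], [0., 3.], [2., 0.]])
query = 2.0 * torch.tensor([3., 1.])  # scaled copy of k("cat"); one candidate
weights = torch.softmax(keys @ query, dim=0)
print(weights)  # index 1 ("cat") should dominate
```

Scaling the query up sharpens the softmax; scaling it down flattens the weights, which is useful for the equal-split question.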

Scaling Up: From 2D to Real Models¶

We just worked with 2D vectors and 4 tokens. In a real transformer (like Qwen2.5-0.5B):

  • Embedding dimension: 896
  • Number of attention heads: 14
  • Head dimension: 896 / 14 = 64
  • Typical sequence length: 50+ tokens
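The head split in the list above can be sketched with random tensors (dimensions taken from the Qwen2.5-0.5B figures quoted above; the bare reshape is illustrative, since real implementations also apply learned Q/K/V projections):

```python
import torch

d_model, n_heads = 896, 14
head_dim = d_model // n_heads  # 64

x = torch.randn(4, d_model)             # embeddings for 4 tokens
x_heads = x.view(4, n_heads, head_dim)  # split into 14 heads of 64 dims each
print(x_heads.shape)  # torch.Size([4, 14, 64])
```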

Think about it: If we had 50 tokens with 64-dimensional key, query, and value vectors:

  1. What would be the shape of the keys matrix?
  2. What would be the shape of the scores matrix (if we computed attention for all queries at once)?
  3. What would be the shape of the output matrix?

We'll work through this on Friday with a real model's dimensions!

your answer here