Logits and Perplexity in Causal Language Models¶

Task: Inspect next-token logits, compute token-level loss, and measure perplexity.

Objectives: OG-LLM-APIs, OG-LLM-Pretrained, OG-LossFunctions

By the end of this notebook, you should be able to:

  • Extract and interpret next-token logits from a pretrained language model
  • Convert logits to probabilities and inspect top-k candidates
  • Compute $-\log(P(\text{token} \mid \text{context}))$ for a specific target token
  • Compute perplexity over a sequence and compare texts by model surprise
  • Measure how quantization changes perplexity

Reminders of some definitions:

  • parameters: the numbers that define the model's behavior, learned during training (e.g., weights and biases, and token embeddings)
  • logits: the raw output scores from a model before applying softmax
    • one for each token in the vocabulary
    • softmax changes logits into probabilities by exponentiating and normalizing
    • relative scores matter (softmax is shift-invariant)
  • perplexity: a measure of how well a probability model predicts a sample.
    • intuition: how "surprised" the model is by the text
    • Lower is better: Lower perplexity means better predictions (less surprise).
    • To compute, average the negative log probabilities of the target tokens, then exponentiate the average loss to get perplexity.
  • a greedy algorithm picks the best thing at each step, without considering future consequences (e.g., picking the token with the highest probability at each step).
  • quantization: reducing the precision of model parameters to save memory and computation, often at the cost of some accuracy.
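To make the softmax and perplexity definitions concrete, here is a tiny standalone sketch in plain Python (no model involved). It shows that shifting all logits by a constant doesn't change the probabilities, and that perplexity is the exponential of the average negative log probability:

```python
import math

def softmax(logits):
    """Convert raw logit scores into probabilities: exponentiate, then normalize."""
    exps = [math.exp(x) for x in logits]
    total = sum(exps)
    return [e / total for e in exps]

logits = [2.0, 1.0, 0.1]
probs = softmax(logits)
shifted = softmax([x + 100 for x in logits])  # shift-invariance: same probabilities

# Perplexity: average the negative log probabilities of the targets, then exponentiate.
target_probs = [0.5, 0.5, 0.5]  # suppose the model assigns 1/2 to each target token
avg_nll = sum(-math.log(p) for p in target_probs) / len(target_probs)
print(round(math.exp(avg_nll), 6))  # 2.0 -- like guessing between two equal options
```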

Setup¶

You already practiced tokenization and model.generate in the previous notebook. Here we go one level deeper: we will inspect the raw logits and probabilities the model uses to choose each next token.

In [ ]:
# If the import fails, uncomment the following line:
# !pip install transformers
import torch, os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import pandas as pd

# Avoid a warning message
os.environ["TOKENIZERS_PARALLELISM"] = "false"

One step in this notebook will ask you to write a function. The most common error when turning notebook code into a function is accidentally using a global variable instead of a value computed inside the function. The cell below defines a quick-and-dirty utility to check for that mistake. (For a more polished version, check out localscope.)

In [ ]:
def check_global_vars(func, allowed_globals):
    import inspect
    used_globals = set(inspect.getclosurevars(func).globals.keys())
    disallowed_globals = used_globals - set(allowed_globals)
    if len(disallowed_globals) > 0:
        raise AssertionError(f"The function {func.__name__} used unexpected global variables: {list(disallowed_globals)}")

The next cell will download and load the model.

As in the previous notebook, we'll use the Hugging Face Transformers library, which provides a (mostly) consistent interface to many different language models. We'll focus on OpenAI's GPT-2, perhaps the first "large" language model, famous for OpenAI's assertion that it was "too dangerous" to release in full.

  • Documentation for the model and tokenizer
  • Model Card for GPT-2
In [ ]:
model_name = "openai-community/gpt2"

# Other models you could try:
# model_name = "EleutherAI/pythia-1.4b-deduped"
# model_name = "google/gemma-3-4b"
# model_name = "google/gemma-3-4b-it"
# Note: you'll need to accept the license agreement on https://huggingface.co/google/gemma-7b to use Gemma models

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to("cpu")
streamer = TextStreamer(tokenizer)

# Add the EOS token as PAD token to avoid warnings
if model.generation_config.pad_token_id is None:
    model.generation_config.pad_token_id = model.generation_config.eos_token_id
# Silence a warning.
tokenizer.decode([tokenizer.eos_token_id]);
print("Loaded on CPU.")
Loading tokenizer...
Loading model...
Loaded on CPU.
In [ ]:
print(f"The tokenizer has {len(tokenizer.get_vocab())} strings in its vocabulary.")
print(f"The model has {model.num_parameters():,d} parameters.")
The tokenizer has 50257 strings in its vocabulary.
The model has 124,439,808 parameters.

Task¶

In the previous notebook, you used generate to produce text. In this notebook, you will manually inspect what generate is based on: next-token logits.

Consider the following phrase:

In [ ]:
phrase = "This weekend I plan to"
# Another one to try later. This was a famous early example of the GPT-2 model:
# phrase = "In a shocking finding, scientists discovered a herd of unicorns living in"

1: Call the tokenizer on the phrase to get a batch. Take a look at what the batch contains, then extract the input_ids.

In [ ]:
batch = tokenizer(ph..., return_tensors='pt')
input_ids = batch['in...']

2: Call the model on the input_ids. Examine the shape of the logits; what does each number mean?

Note: The model returns an object that has multiple values. The logits are in model_output.logits.

In [ ]:
with torch.no_grad(): # This tells PyTorch we don't need it to compute gradients for us.
    model_output = model(...)
print(f"logits shape: {list(model_output.logits.shape)}")
logits shape: [1, 5, 50257]

3: Pull out the logits corresponding to the last token in the input phrase. Hint: Think about what each number in the shape means. Remember that in Python, arr[-1] is shorthand for arr[len(arr) - 1].

In [ ]:
last_token_logits = model_output.logits[...]
assert last_token_logits.shape == (len(tokenizer.get_vocab()),)

4: Identify the token id and corresponding string of the most likely next token.

To find the most likely token, we need to find the index of the largest value in the last_token_logits. The method that does this is called argmax. (It's a common enough operation that it's built into PyTorch.)

Note: The tokenizer has a decode method that takes a token id, or a list of token ids, and returns the corresponding string.
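As a quick standalone illustration of argmax (a toy tensor, not the model's logits):

```python
import torch

scores = torch.tensor([0.1, 2.7, 0.3, 1.5])
best_index = scores.argmax()   # index of the largest value
print(int(best_index))         # 1
```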

In [ ]:
# compute the probability distribution over the next token
last_token_probabilities = last_token_logits.sof...(dim=-1)
# dim=-1 means to compute the softmax over the last dimension
In [ ]:
most_likely_token_id = ...
decoded_token = tokenizer.decode(most_likely_token_id)
probability_of_most_likely_token = last_token_probabilities[...]

print("For the phrase:", phrase)
print(f"Most likely next token: {most_likely_token_id}, which corresponds to {repr(decoded_token)}, with probability {probability_of_most_likely_token:.2%}")
For the phrase: This weekend I plan to
Most likely next token: 467, which corresponds to ' go', with probability 5.79%

5: Use the topk method to find the top-10 most likely choices for the next token.

See the documentation for torch.topk. Calling topk on a tensor returns a named tuple with two tensors: values and indices. The values are the top-k values, and the indices are the indices of those values in the original tensor. (In this case, the indices are the token ids.)
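Here is what topk returns on a toy tensor (a standalone sketch, not the model's logits):

```python
import torch

scores = torch.tensor([0.1, 2.7, 0.3, 1.5], dtype=torch.float64)
top2 = scores.topk(2)
print(top2.values.tolist())    # [2.7, 1.5] -- the two largest scores, descending
print(top2.indices.tolist())   # [1, 3] -- their positions in the original tensor
```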

Note: This uses Pandas to make a nicely displayed table, and a list comprehension to decode the tokens. You don't need to understand every detail, but I encourage you to think through what's going on.

In [ ]:
most_likely_tokens = last_token_logits.topk(...)
print(f"most likely token index from topk is {most_likely_tokens.indices[0]}") # this should be the same as argmax
decoded_tokens = [tokenizer.decode(...) for ... in most_likely_tokens.indices]
probabilities_of_most_likely_tokens = last_token_probabilities[most_likely_tokens.indices]

# Make a nice table to show the results
most_likely_tokens_df = pd.DataFrame({
    'tokens': decoded_tokens,
    'probabilities': probabilities_of_most_likely_tokens,
})
# Show the table, in a nice formatted way (see https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#Builtin-Styles)
# Caution: this "gradient" has *nothing* to do with gradient descent! (It's a color gradient.)
most_likely_tokens_df.style.hide(axis='index').background_gradient()
most likely token index from topk is 467
Out[ ]:
tokens probabilities
go 0.057940
take 0.053050
attend 0.038625
visit 0.036411
be 0.027352
do 0.024958
make 0.023818
spend 0.021303
play 0.019172
travel 0.017760
6: Write a function that, given a phrase and a number k, returns the most_likely_tokens_df DataFrame with the top-k most likely next tokens. (Don't include the style line.)

Build this function using only code that you've already filled in above. Clean up the code so that it doesn't do or display anything extraneous. Add comments about what each step does.

In [ ]:
def predict_next_tokens(...):
    # your code here

def show_tokens_df(tokens_df):
    return tokens_df.style.hide(axis='index').background_gradient()

check_global_vars(predict_next_tokens, allowed_globals=["torch", "tokenizer", "pd", "model"])
In [ ]:
show_tokens_df(predict_next_tokens("This weekend I plan to", 5))
Out[ ]:
tokens probabilities
go 0.057940
take 0.053050
attend 0.038625
visit 0.036411
be 0.027352
In [ ]:
show_tokens_df(predict_next_tokens("To be or not to", 5))
Out[ ]:
tokens probabilities
be 0.964031
become 0.004372
have 0.004315
Be 0.001392
get 0.000955
In [ ]:
show_tokens_df(predict_next_tokens("For God so loved the", 5))

Perplexity¶

7: Loss for a single token

So far you looked at top predictions. Now flip the question: given text that already exists, how surprised was the model by a specific token? For a target token, compute the negative log-likelihood $-\log(P(\text{token} \mid \text{context}))$. This is exactly the single-token loss used in training.
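The arithmetic itself is simple; a standalone sketch with a made-up probability:

```python
import math

# Suppose the model assigned probability 0.25 to the token that actually occurred:
p = 0.25
token_loss = -math.log(p)
print(round(token_loss, 4))   # 1.3863

# A fully confident correct prediction (p = 1) would have loss -log(1) = 0;
# the rarer the model considered the actual token, the larger the loss.
```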

The code below tokenizes a phrase and runs the model. Your job: extract the probability of the actual last token, then compute its loss.

In [ ]:
text = "Let's stop for lunch because I'm getting hungry"
input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(model.device)

with torch.no_grad():
    logits = model(input_ids=input_ids).logits

# The final token in the sequence is the target token, and logits at -2 predict it.
actual_last_token_id = input_ids[0, ...]
probs_before_last_token = logits[0, -2].softmax(dim=-1)

prob_of_actual_last_token = probs_before_last_token[...]
last_token_loss = -torch.log(...)
actual_last_token_int = int(actual_last_token_id.detach().cpu())

print("Text:", text)
print("Actual last token:", repr(tokenizer.decode([actual_last_token_int])))
print(f"Token with highest probability: {tokenizer.decode([probs_before_last_token.argmax()])!r}, with probability {float(probs_before_last_token.max()):.4f}")
print(f"P(actual token | previous context): {float(prob_of_actual_last_token):.4f}")
print(f"Token loss = -log(P): {float(last_token_loss):.4f}")

Think about the loss value you just computed. What would a loss of 0 mean? What would the loss be if the model were certain the next token would be "hungry"?

8: Per-token surprise across a whole sequence

Now compute surprise for every predicted next token in a sentence. The token loss value $-\log(P(\text{token} \mid \text{context}))$ is larger when the model finds that token less expected.

Fill in the key line in the loop below.

In [ ]:
def token_surprise_table(text):
    input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(model.device)
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits  # shape: (1, seq_len, vocab_size)

    # logits[0, i, :] predicts token at position i + 1
    rows = []
    for i in range(input_ids.shape[1] - 1):
        # Goal: compute the surprise (negative log probability) of the
        # token that actually appears at position i + 1,
        # given the model's predictions at position i.

        probs = logits[0, ...]...
        actual_next_token = input_ids[0, ...]
        token_loss = ...
        actual_next_token_int = int(actual_next_token.detach().cpu())
        rows.append({
            "previous_tokens": tokenizer.decode(input_ids[0, :i+1]),
            "token": tokenizer.decode([actual_next_token_int]),
            "probability": float(probs[actual_next_token]),
            "surprise": float(token_loss),
        })
    return pd.DataFrame(rows)
In [ ]:
surprise_df = token_surprise_table(text)
surprise_df

Look at the surprise values. Which token has the highest surprise? Which has the lowest? Why do you think the model found some tokens harder to predict than others?

9: Sequence perplexity

Perplexity is defined as $\exp(\text{average token loss})$. Intuitively, it is the model's effective branching factor: lower means the model is less surprised by the text.
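The "branching factor" intuition can be checked directly: for a uniform distribution over n equally likely outcomes, the perplexity is exactly n (a standalone sketch):

```python
import math

def perplexity_of_uniform(n):
    # Each outcome has probability 1/n; average the (equal) negative
    # log probabilities, then exponentiate.
    avg_nll = -math.log(1.0 / n)
    return math.exp(avg_nll)

print(round(perplexity_of_uniform(2), 6))  # 2.0  (fair coin)
print(round(perplexity_of_uniform(6), 6))  # 6.0  (fair six-sided die)
```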

Write a compute_perplexity function. You already have all the pieces from the previous step — now wrap it up and compute the final number.

In [ ]:
import math

def compute_perplexity(text, model_to_use=None):
    if model_to_use is None:
        model_to_use = model
    input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(model_to_use.device)
    # Strategy: collect losses in a list, then average them,
    # then exponentiate the average loss to get perplexity.
    # your code here
    return math.exp(...)

check_global_vars(compute_perplexity, allowed_globals=["torch", "tokenizer", "model", "math"])
In [ ]:
texts = [
    "The cat sat on the mat.",
    "The cat computed the eigenvalue.",
    "Flurb zazzle moop tink wob.",
]

pd.DataFrame({
    "text": texts,
    "perplexity": [compute_perplexity(t) for t in texts],
}).sort_values("perplexity")

A fair coin has perplexity 2 (two equally likely options). A fair 6-sided die has perplexity 6. What does the perplexity you computed mean in terms of "how many equally likely options was the model choosing among"?

10: Break the model, measure the damage

How robust is the model to degraded parameters? Quantize the weights to lower precision and measure how perplexity changes.

Before running the next cell: this model has 124M parameters. If each parameter is stored at a given number of bits, how many bytes would that take? Fill in the table below with ballpark numbers (e.g., one of them will be 124 megabytes; remember 8 bits = 1 byte), then run the cell to see what happens to perplexity at each level.

Bits per parameter    | Model size
----------------------|-------------------------------------
32 (original float32) | fill in model size in familiar units
24                    | fill in model size in familiar units
16                    | fill in model size in familiar units
8                     | fill in model size in familiar units
4                     | fill in model size in familiar units
2                     | fill in model size in familiar units
1                     | fill in model size in familiar units
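If you want to check your ballpark numbers, the arithmetic is a one-liner per row (using the 124,439,808-parameter count printed earlier, and ignoring any storage overhead):

```python
n_params = 124_439_808  # GPT-2's parameter count, printed earlier in this notebook

for bits in [32, 24, 16, 8, 4, 2, 1]:
    size_mb = n_params * bits / 8 / 1e6   # 8 bits = 1 byte
    print(f"{bits:>2} bits/param -> about {size_mb:,.0f} MB")
```

At 8 bits per parameter this gives about 124 MB, matching the hint above.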
In [ ]:
import copy

def quantize_model(original_model, bits):
    """Simulate uniform quantization by rounding parameters to a fixed number of levels."""
    quantized = copy.deepcopy(original_model).cpu()
    with torch.no_grad():
        levels = 2 ** bits - 1
        for param in quantized.parameters():
            pmin = param.min()
            pmax = param.max()
            if torch.isclose(pmax, pmin):
                continue
            scale = (pmax - pmin) / levels
            q = ((param - pmin) / scale).round().clamp(0, levels)
            param.copy_(q * scale + pmin)
    return quantized
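To see concretely what the quantizer does to each parameter tensor, here is the same rounding applied by hand to a tiny made-up tensor (1 bit means every value snaps to either the tensor's min or its max):

```python
import torch

# A hypothetical 4-value "parameter" tensor, quantized to 1 bit by hand:
param = torch.tensor([0.0, 0.3, 0.6, 1.0])
bits = 1
levels = 2 ** bits - 1                       # one interval between min and max
pmin, pmax = param.min(), param.max()
scale = (pmax - pmin) / levels
q = ((param - pmin) / scale).round().clamp(0, levels)
dequantized = q * scale + pmin
print(dequantized)                           # tensor([0., 0., 1., 1.])
```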
In [ ]:
quant_test_text = "The cat sat on the mat."
bit_results = []

for bits in [32, 24, 16, 8, 4, 2, 1]:
    q_model = quantize_model(model, bits=bits)
    q_ppl = compute_perplexity(quant_test_text, model_to_use=q_model)
    bit_results.append({"bits": bits, "perplexity": q_ppl})
    print(f"{bits}-bit perplexity: {q_ppl:.2f}")

pd.DataFrame(bit_results)

Analysis¶

Write your answers to these questions on Moodle.

Q1: Give a specific example of the shape of model_output.logits and explain what each number means.

your answer here

Q2: Change the -1 in the definition of last_token_logits to -3. What does the variable represent now (what would be a better name for it)? What does its argmax represent?

your answer here

Q3: In your per-token surprise table, what was the highest-surprise token? Why do you think the model found it surprising?

your answer here

Q4: How did quantization affect perplexity? At what bit width did the model start to degrade meaningfully? What does this suggest about precision needs?

your answer here

Q5 (Bonus): Without looking back at code, write the expression for -log(P(token | context)) for the word "Michigan" in "I visited Muskegon, Michigan". What values would you need from the model to compute it?

your answer here