Task: Ask a language model how likely each token in its vocabulary is to be the next one.
We start in the same way as the tokenization notebook:
# If the import fails, uncomment the following line:
# !pip install transformers
import torch
from torch import tensor
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
# Avoid a warning message
import os; os.environ["TOKENIZERS_PARALLELISM"] = "false"
One step in this notebook will ask you to write a function. The most common error when function-ifying notebook code is accidentally using a global variable instead of a value computed in the function. This is a quick and dirty little utility to check for that mistake. (For a more polished version, check out localscope.)
def check_global_vars(func, allowed_globals):
    import inspect
    used_globals = set(inspect.getclosurevars(func).globals.keys())
    disallowed_globals = used_globals - set(allowed_globals)
    if len(disallowed_globals) > 0:
        raise AssertionError(f"The function {func.__name__} used unexpected global variables: {list(disallowed_globals)}")
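Here's a quick toy illustration (not part of the assignment) of the kind of mistake this utility catches:
scale = 10
def buggy_double(x):
    return x * scale  # bug: `scale` sneaks in as a global instead of a parameter

try:
    check_global_vars(buggy_double, allowed_globals=[])
except AssertionError as e:
    print(e)  # reports that `scale` is an unexpected global variable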
Download and load the model.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2", add_prefix_space=True) # smaller version of GPT-2
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
print(f"The tokenizer has {len(tokenizer.get_vocab())} strings in its vocabulary.")
print(f"The model has {model.num_parameters():,d} parameters.")
The tokenizer has 50257 strings in its vocabulary.
The model has 81,912,576 parameters.
In the tokenization notebook, we simply used the generate method to have the model generate some text. Now we'll do it ourselves.
Consider the following phrase:
phrase = "This weekend I plan to"
# Another one to try later. This was a famous early example of the GPT-2 model:
# phrase = "In a shocking finding, scientists discovered a herd of unicorns living in"
1: Call the tokenizer on the phrase to get a batch that includes input_ids.
batch = tokenizer(ph..., return_tensors='pt')
input_ids = batch['in...']
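If you're not sure what the tokenizer hands back, it can help to poke at it on a throwaway string first (a toy example; the exact ids don't matter):
toy_batch = tokenizer("a tiny example", return_tensors='pt')
print(toy_batch.keys())        # includes 'input_ids' (and usually 'attention_mask')
print(toy_batch['input_ids'])  # a 2-D tensor: 1 sequence by (number of tokens)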
2: Call the model on the input_ids. Examine the shape of the logits.
with torch.no_grad(): # This tells PyTorch we don't need it to compute gradients for us.
    model_output = model(...)
print(f"logits shape: {list(model_output.lo...)}")
logits shape: [1, 5, 50257]
3: Pull out the logits corresponding to the last token in the input phrase. Hint: Think about what each number in the shape means.
Note: The model returns a dictionary-like object. The logits are in model_output.logits.
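If you're unsure about the indexing, here's the same idea on a made-up tensor with a tiny pretend vocabulary (toy shapes, purely for illustration):
toy_logits = torch.arange(2 * 3 * 4).reshape(2, 3, 4)  # shaped like (batch, positions, vocab size)
print(toy_logits.shape)         # torch.Size([2, 3, 4])
print(toy_logits[0, -1])        # the 4 "vocabulary" scores at the last position of the first sequence
print(toy_logits[0, -1].shape)  # torch.Size([4])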
last_token_logits = model_output.logits[...]
assert last_token_logits.shape == (len(tokenizer.get_vocab()),)
4: Identify the token id and corresponding string of the most likely next token.
To find the most likely token, we need to find the index of the largest value in last_token_logits. The method that does this is called argmax. (It's a common enough operation that it's built into PyTorch.)
Note: The tokenizer has a decode method that takes a token id, or a list of token ids, and returns the corresponding string.
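If softmax or argmax is new to you, here's what each does on a tiny made-up tensor (the numbers are arbitrary):
toy_scores = tensor([2.0, 1.0, 0.1])
print(toy_scores.softmax(dim=-1))  # three probabilities that sum to 1; the biggest score gets the biggest probability
print(toy_scores.argmax())         # tensor(0): the index of the largest score
print(repr(tokenizer.decode(42)))  # decode maps a token id (here an arbitrary one) back to its string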
# compute the probability distribution over the next token
last_token_probabilities = last_token_logits.sof...(dim=-1)
# dim=-1 means to compute the softmax over the last dimension
most_likely_token_id = ...
decoded_token = tokenizer.decode(most_likely_token_id)
probability_of_most_likely_token = last_token_probabilities[...]
print("For the phrase:", phrase)
print(f"Most likely next token: {most_likely_token_id}, which corresponds to {repr(decoded_token)}, with probability {probability_of_most_likely_token:.2%}")
For the phrase: This weekend I plan to
Most likely next token: 467, which corresponds to ' go', with probability 5.98%
5: Use the topk method to find the top-10 most likely choices for the next token.
See the documentation for torch.topk. Calling topk on a tensor returns a named tuple with two tensors: values and indices. The values are the top-k values, and the indices are the indices of those values in the original tensor. (In this case, the indices are the token ids.)
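Here's topk on a small made-up tensor, just to see the shape of what comes back:
toy_scores = tensor([0.1, 4.0, 2.0, 3.0])
top2 = toy_scores.topk(2)
print(top2.values)   # tensor([4., 3.]): the two largest values, biggest first
print(top2.indices)  # tensor([1, 3]): where those values sit in the original tensor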
Note: This uses Pandas to make a nicely displayed table, and a list comprehension to decode the tokens. You don't need to understand how this all works, but I highly encourage thinking about what's going on.
most_likely_tokens = last_token_logits.topk(...)
print(f"most likely token index from topk is {most_likely_tokens.indices[0]}") # this should be the same as argmax
decoded_tokens = [tokenizer.decode(...) for ... in most_likely_tokens.indices]
probabilities_of_most_likely_tokens = last_token_probabilities[most_likely_tokens.indices]
# Make a nice table to show the results
most_likely_tokens_df = pd.DataFrame({
    'tokens': decoded_tokens,
    'probabilities': probabilities_of_most_likely_tokens,
})
# Show the table, in a nice formatted way (see https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#Builtin-Styles)
# Caution: this "gradient" has *nothing* to do with gradient descent! (It's a color gradient.)
most_likely_tokens_df.style.hide(axis='index').background_gradient()
most likely token index from topk is 467
| tokens | probabilities |
|---|---|
| go | 0.059828 |
| take | 0.043880 |
| spend | 0.031570 |
| make | 0.030519 |
| do | 0.029206 |
| be | 0.027960 |
| attend | 0.025885 |
| visit | 0.025827 |
| run | 0.022074 |
| have | 0.020955 |
6: Build a function predict_next_tokens(phrase, k) that returns a table of the k most likely next tokens and their probabilities, using only code that you've already filled in above. Clean up the code so that it doesn't do or display anything extraneous. Add comments about what each step does.
def predict_next_tokens(...):
    # your code here
check_global_vars(predict_next_tokens, allowed_globals=["torch", "tokenizer", "pd", "model"])
predict_next_tokens("This weekend I plan to", 5).style.hide_index().background_gradient()
| tokens | probabilities |
|---|---|
| go | 0.059828 |
| take | 0.043880 |
| spend | 0.031570 |
| make | 0.030519 |
| do | 0.029206 |
predict_next_tokens("To be or not to", 5).style.hide_index().background_gradient()
| tokens | probabilities |
|---|---|
| be | 0.648473 |
| have | 0.021346 |
| the | 0.012962 |
| do | 0.009471 |
| , | 0.007444 |
predict_next_tokens("For God so loved the", 5).style.hide_index().background_gradient()
Q1: Explain the shape of model_output.logits.
Q2: Change the -1 in the definition of last_token_logits to -3. What does the variable represent now? What does its argmax represent?
Q3: Let's think. The method in this notebook only gets the scores for one next token at a time. What if we wanted to do a whole sentence? We'd have to generate a token for each word in that sentence. What are a few different ways we could adapt the approach used in this notebook to generate a complete sentence?
To think about different ways to do this, think about what decision(s) you have to make when generating each token.
Note: you don't have to write any code to answer this question.
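To make the hint concrete, here is a rough sketch of just one of those ways: greedy decoding, where every step takes the single most likely token and feeds it back in. The interesting part of Q3 is what other choices you could make at the line marked "the decision". (This is only a sketch; you don't need to run or reproduce it.)
# Rough sketch, not a model answer: greedy decoding with distilgpt2.
generated_ids = tokenizer("This weekend I plan to", return_tensors='pt')['input_ids']
for _ in range(10):  # add 10 more tokens
    with torch.no_grad():
        logits = model(generated_ids).logits
    next_id = logits[0, -1].argmax()  # the decision: here we always take the most likely token
    generated_ids = torch.cat([generated_ids, next_id.reshape(1, 1)], dim=1)
print(tokenizer.decode(generated_ids[0]))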