Tokenization¶

Task: Convert text to numbers; interpret subword tokenization.

There are many ways to convert text to numbers. This assignment works with one popular approach: assigning numbers to parts of words (subword tokenization).
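To make this concrete before we load a real tokenizer, here is a toy sketch. The vocabulary and the greedy longest-match loop below are made up purely for illustration (GPT-2 uses byte-pair encoding, which builds its vocabulary differently), but the core idea is the same: unfamiliar words are broken into familiar pieces, and each piece maps to an integer id.

# a tiny made-up vocabulary of word pieces and their ids
toy_vocab = {"token": 0, "ization": 1, "visit": 2, "ed": 3}
word = "tokenization"
# greedily take the longest known piece at each position
pieces, i = [], 0
while i < len(word):
    for end in range(len(word), i, -1):
        if word[i:end] in toy_vocab:
            pieces.append(word[i:end])
            i = end
            break
    else:
        raise ValueError(f"cannot tokenize {word[i:]!r}")
print(pieces)                          # ['token', 'ization']
print([toy_vocab[p] for p in pieces])  # [0, 1]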

Setup¶

We'll be using the HuggingFace Transformers library, which provides a (mostly) consistent interface to many different language models. We'll focus on the OpenAI GPT-2 model, famous for OpenAI's assertion that it was "too dangerous" to release in full.

Documentation for the model and tokenizer.

The transformers library is pre-installed on many systems, but in case you need to install it, you can run the following cell.

In [1]:
# Uncomment the following line to install the transformers library
#!pip install -q transformers
In [2]:
import torch
from torch import tensor

Download and load the model¶

This cell downloads the model and tokenizer, and loads them into memory.

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# We'll use this smaller version of GPT-2
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
# An alternative to add_prefix_space is to pass is_split_into_words=True when calling the tokenizer.
# Use the EOS token as the PAD token to avoid warnings.
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
In [4]:
token_to_id_dict = tokenizer.get_vocab()
print(f"The tokenizer has {len(token_to_id_dict)} strings in its vocabulary.")
print(f"The model has {model.num_parameters():,d} parameters.")
The tokenizer has 50257 strings in its vocabulary.
The model has 81,912,576 parameters.
In [5]:
# warning: this assumes that there are no gaps in the token ids, which happens to be true for this tokenizer.
id_to_token = [token for token, id in sorted(token_to_id_dict.items(), key=lambda x: x[1])]
print(f"The first 10 tokens are: {id_to_token[:10]}")
print(f"The last 10 tokens are: {id_to_token[-10:]}")
The first 10 tokens are: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']
The last 10 tokens are: ['Ġ(/', 'â̦."', 'Compar', 'Ġamplification', 'ominated', 'Ġregress', 'ĠCollider', 'Ġinformants', 'Ġgazed', '<|endoftext|>']
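To see how the two directions fit together, here is a small round trip, assuming 'Ġvisited' is one of the vocabulary strings (it is for this tokenizer, as the tasks below will show):

# look up the id of a vocabulary string, then map the id back with the list we just built
example_id = token_to_id_dict["Ġvisited"]
print(example_id, id_to_token[example_id])  # prints the id, then 'Ġvisited' again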

Task¶

Consider the following phrase:

In [6]:
phrase = "I visited Muskegon"
# Another one to try later. This was a famous early example of the GPT-2 model:
# phrase = "In a shocking finding, scientists discovered a herd of unicorns living in"

Getting familiar with tokens¶

1: Use tokenizer.tokenize to convert the phrase into a list of tokens. (What do you think the Ġ means?)

In [7]:
tokens = tokenizer.tokenize(phrase)
tokens
Out[7]:
['ĠI', 'Ġvisited', 'ĠMus', 'ke', 'gon']

2: Use tokenizer.convert_tokens_to_string to convert the tokens back into a string.

In [8]:
# your code here
Out[8]:
' I visited Muskegon'

3: Use tokenizer.encode to convert the original phrase into token ids. (Note: this is equivalent to tokenize followed by convert_tokens_to_ids.) Call the result input_ids.

In [9]:
input_ids = ...
input_ids
Out[9]:
[314, 8672, 2629, 365, 14520]
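The note above claims that encode is equivalent to tokenize followed by convert_tokens_to_ids. If you want to verify that, here is a quick sanity check (a sketch, assuming tokenizer and phrase from the cells above):

# the two-step route should produce exactly the same ids as encode
two_step = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase))
print(two_step == tokenizer.encode(phrase))  # should print True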

4: Turn input_ids back into a readable string. Try this two ways: (1) using convert_ids_to_tokens and (2) using tokenizer.decode.

In [10]:
# using convert_ids_to_tokens
# your code here
Out[10]:
' I visited Muskegon'
In [11]:
# using tokenizer.decode
# your code here
Out[11]:
' I visited Muskegon'

Applying what you learned¶

5: Use model.generate(tensor([input_ids])) to generate a completion of this phrase. (Note that we needed to add []s to give a "batch" dimension to the input.) Call the result output_ids.

In [12]:
# your code here
Out[12]:
tensor([[  314,  8672,  2629,   365, 14520,    11,   290,   314,   373,  6655,
           284,  1064,   326,   262,  1748,   550,   407,   587,  1498,   284,
          2148,   257,  1774,  1171,  9358,  1080,    13,   198,   198,   198,
           198,   464,  1748,   468,   407,   587,  1498,   284,  2148,   257,
          1774,  1171,  9358,  1080,    13,   198,   464,  1748,   468,   407,
           587,  1498,   284,  2148,   257]])
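About the batch dimension mentioned in the task: a quick shape check (assuming input_ids from task 3) shows what the extra []s do:

# generate expects input of shape (batch_size, sequence_length)
print(tensor(input_ids).shape)    # torch.Size([5])    -- just a sequence
print(tensor([input_ids]).shape)  # torch.Size([1, 5]) -- a batch containing one sequence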

6: Convert your output_ids into a readable form. (Note: it has an extra "batch" dimension, so you'll need to use output_ids[0].)

In [13]:
# your code here
Out[13]:
' I visited Muskegon, and I was surprised to find that the city had not been able to provide a proper public transportation system.\n\n\n\nThe city has not been able to provide a proper public transportation system.\nThe city has not been able to provide a'

Note: generate uses greedy decoding by default, but it's highly customizable. We'll play more with it in later exercises. For now, if you want more interesting results, try:

  • Turn on do_sample=True. Run it a few times to see what it gives.
  • Set top_k=5, or 50. (A sketch of sampled decoding follows this list.)
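For example, here is a sketch of sampled decoding, assuming model, tokenizer, and input_ids from the cells above (the max_length value is just an illustrative choice):

# sampling is non-deterministic, so each run can produce a different continuation
sampled = model.generate(
    tensor([input_ids]),
    do_sample=True,  # sample from the predicted distribution instead of taking the argmax
    top_k=50,        # consider only the 50 most likely tokens at each step
    max_length=40,   # cap the total length of the generated sequence
)
print(tokenizer.decode(sampled[0]))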

7: What is the largest possible token id for this tokenizer? What token does it correspond to?

In [14]:
# your code here

Analysis¶

Q1: Write a brief explanation of what a tokenizer does. Note that we worked with two parts of a tokenizer in this exercise (one that deals only with strings, and another that deals with numbers); make sure your explanation addresses both parts.

your response here

Q2: What do you think the Ġ means? (Hint: it replaces a single well-known character.)

your response here

Q3: Suppose you add some personal flair to your writing by doubling some letters. Explain what the tokenizer we have loaded up in this notebook will do with your embellished writing.

your response here