Tokenization¶
Task: Convert text to numbers; interpret subword tokenization.
There are various ways of converting text to numbers. This assignment works with one popular approach: assigning numbers to parts of words (subword tokenization).
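For intuition, here's a toy sketch of that idea (the vocabulary and ids below are made up; the real tokenizer we load later learned its subword pieces from a large corpus):
# A made-up miniature vocabulary mapping subword strings to integer ids.
toy_vocab = {"un": 0, "break": 1, "able": 2}
# "unbreakable" isn't in this vocabulary, but its pieces are:
toy_ids = [toy_vocab[piece] for piece in ("un", "break", "able")]
print(toy_ids)  # [0, 1, 2]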
Setup¶
We'll be using the HuggingFace Transformers library, which provides a (mostly) consistent interface to many different language models. We'll focus on the OpenAI GPT-2 model, famous for OpenAI's assertion that it was "too dangerous" to release in full.
- Documentation for the model and tokenizer.
- Model Card for GPT-2.
The transformers library is pre-installed on many systems, but in case you need to install it, you can run the following cell.
# Uncomment the following line to install the transformers library
#!pip install -q transformers
import torch
from torch import tensor
Download and load the model¶
This cell downloads the model and tokenizer, and loads them into memory.
# https://huggingface.co/docs/transformers/en/generation_strategies
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, set_seed
model_name = "openai-community/gpt2"
# Here are a few larger models you could try:
# model_name = "EleutherAI/pythia-1.4b-deduped"
# model_name = "google/gemma-2b"
# model_name = "google/gemma-2b-it"
# Note: you'll need to accept the license agreement on https://huggingface.co/google/gemma-7b to use Gemma models
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Use the EOS token as the PAD token to avoid warnings.
if model.generation_config.pad_token_id is None:
    model.generation_config.pad_token_id = model.generation_config.eos_token_id
streamer = TextStreamer(tokenizer)
# Silence a warning.
tokenizer.decode([tokenizer.eos_token_id]);
token_to_id_dict = tokenizer.get_vocab()
print(f"The tokenizer has {len(token_to_id_dict)} strings in its vocabulary.")
print(f"The model has {model.num_parameters():,d} parameters.")
The tokenizer has 50257 strings in its vocabulary.
The model has 124,439,808 parameters.
# warning: this assumes that there are no gaps in the token ids, which happens to be true for this tokenizer.
id_to_token = [token for token, id in sorted(token_to_id_dict.items(), key=lambda x: x[1])]
print(f"The first 10 tokens are: {id_to_token[:10]}")
print(f"The last 10 tokens are: {id_to_token[-10:]}")
The first 10 tokens are: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']
The last 10 tokens are: ['Ġ(/', 'âĢ¦."', 'Compar', 'Ġamplification', 'ominated', 'Ġregress', 'ĠCollider', 'Ġinformants', 'Ġgazed', '<|endoftext|>']
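If you'd rather verify that "no gaps" assumption than take it on faith, here's a quick check (a sketch using only the token_to_id_dict defined above):
# The ids are unique, so min == 0 and max == len - 1 together
# imply the ids are exactly 0, 1, ..., len(vocab) - 1.
assert min(token_to_id_dict.values()) == 0
assert max(token_to_id_dict.values()) == len(token_to_id_dict) - 1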
Demo¶
Run this code cell to see a demo of the language model in action. You can change the text to see how the model continues it. (You can play with the parameters if you want, but don't get side-tracked; we'll explore these in another notebook.)
Notice:
- The model continues the text in a way that seems coherent.
- The model generates one token at a time; tokens include punctuation.
- Some tokens include a space at the beginning.
set_seed(0)
model.generate(
    **tokenizer("A list of colors: red, blue,", return_tensors="pt"),
    max_new_tokens=10, do_sample=True, temperature=0.3, penalty_alpha=.5, top_k=5,
    streamer=streamer);
A list of colors: red, blue, green, blue, yellow, yellow, green,
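One way to see the last two points for yourself is to split a similar string into its tokens (a quick sketch; we'll work with tokenize in the Task below):
# Punctuation gets its own token, and a leading 'Ġ' marks tokens that begin with a space.
print(tokenizer.tokenize("A list of colors: red, blue, green"))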
Task¶
Consider the following phrase:
phrase = "I visited Muskegon"
# Another one to try later. This was a famous early example of the GPT-2 model:
# phrase = "In a shocking finding, scientists discovered a herd of unicorns living in"
Getting familiar with tokens¶
1: Use tokenizer.tokenize to convert the phrase into a list of tokens. (What do you think the Ġ means?)
tokens = tokenizer.tokenize(phrase)
tokens
['ĠI', 'Ġvisited', 'ĠMus', 'ke', 'gon']
2: Use tokenizer.convert_tokens_to_string to convert the tokens back into a string.
# your code here
' I visited Muskegon'
# for comparison:
''.join(tokens)
'ĠIĠvisitedĠMuskegon'
What is the difference between the output from convert_tokens_to_string and the result of ''.join(tokens)?
your answer here
3: Use tokenizer.encode to convert the original phrase into token ids. (Note: this is equivalent to tokenize followed by convert_tokens_to_ids. Remember, tokenizers have two jobs; these correspond to the two methods.) Call the result input_ids.
input_ids = ...
input_ids
[314, 8672, 2629, 365, 14520]
4: Turn input_ids back into a readable string. Try this two ways: (1) using tokenizer.decode and (2) in two steps: using convert_ids_to_tokens, then a second step that you've already done previously. The result of (1) should be the same as the result of (2).
# using convert_ids_to_tokens
# your code here
' I visited Muskegon'
# using tokenizer.decode
# your code here
' I visited Muskegon'
Applying what you learned¶
5: Use model.generate(input_ids_batch) to generate a completion of this phrase. (Note that we need to wrap input_ids in []s to give the input a "batch" dimension, and convert it to a PyTorch tensor so the model code can use it.) Call the result output_ids. This one is done for you.
input_ids_batch = tensor([input_ids])
output_ids = model.generate(input_ids_batch, max_new_tokens=20)[0] # the [0] is to get the first example in the batch
output_ids
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
tensor([ 314, 8672, 2629, 365, 14520, 11, 290, 314, 373, 1297,
326, 262, 1748, 373, 287, 262, 1429, 286, 852, 3170,
13, 314, 373, 1297, 326])
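The warning above appears because we called generate without an attention mask. It's harmless for this exercise, but one way to silence it (a sketch, not required here) is to let the tokenizer build the mask and pass it along:
# The tokenizer returns both input_ids and an attention_mask;
# passing both to generate avoids the warning above.
encoded = tokenizer(phrase, return_tensors="pt")
output_ids = model.generate(**encoded, max_new_tokens=20)[0]  # same greedy completion, no warning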
6: Convert your output_ids into a readable form.
# your code here
' I visited Muskegon, and I was told that the city was in the process of being built. I was told that'
Note: generate uses greedy decoding by default, but it's highly customizable. We'll play more with it in later exercises. For now, if you want more interesting results, try adding the following arguments to generate (there's a sketch after this list):
- Turn on do_sample=True. Run it a few times to see what it gives. Try temperature=0.7 or temperature=1.5.
- With sampling enabled, set top_k=5. Or 50.
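For example, here's a minimal sketch (the seed and parameter values are arbitrary; change them and re-run):
# Sample a completion instead of decoding greedily.
set_seed(42)  # arbitrary seed, just to make the sampled output reproducible
sampled_ids = model.generate(
    input_ids_batch,
    max_new_tokens=20,
    do_sample=True,   # sample from the model's distribution instead of taking the argmax
    temperature=0.7,  # <1 sharpens the distribution; >1 flattens it
    top_k=50,         # restrict sampling to the 50 most likely tokens
)[0]
print(tokenizer.decode(sampled_ids))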
7: What is the largest possible token id for the tokenizer we're using in this notebook? What token does it correspond to? (Hint: at the top of the notebook, we printed out the size of the vocabulary.)
# your code here
Analysis¶
Q1: Write a brief explanation of what a tokenizer does. Specifically, explain the two-step process of tokenization (text→tokens→ids) and how this enables language models to process text.
your response here
Q2: Try having the model complete the prefix "The word water is spelled w a". Explain why the model might struggle with this seemingly simple task by thinking about the tokenization process.
your response here
Q3: Does capitalization affect the output of the tokenizer? i.e., does the result of tokenizing a capitalized word differ from tokenizing a lowercased word? Run a simple test to find out. Then, try out how it handles misspellings.
your response here
# your code here