Prompt Engineering

In [ ]:
# Set up the environment.
!pip install --upgrade huggingface_hub transformers tokenizers

# Optional: uncomment to install bitsandbytes for quantized loading.
#!pip install bitsandbytes

# Optional: log in to Hugging Face with a token stored as a Kaggle secret
# (only needed if the model repository requires authentication).
#from huggingface_hub import login
#from kaggle_secrets import UserSecretsClient
#access_token_read = UserSecretsClient().get_secret("HUGGINGFACE_TOKEN")
#login(token = access_token_read)

# Optional: development version of transformers and related extras.
#!pip install git+https://github.com/huggingface/transformers -U
#!pip install accelerate
#!pip install -i https://pypi.org/simple/ bitsandbytes
In [ ]:
import torch, os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# Load the model. If a model from an earlier run is still in memory, delete it
# first so its weights can be freed.
if 'model' in globals(): del model
USE_INSTRUCTION_TUNED = False # we'll switch this to True partway through the lab
if USE_INSTRUCTION_TUNED:
    model_name_kaggle = '/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1'
    model_name_hf = 'google/gemma-3-1b-it'
else:
    # Use the base model ("pt" = "pre-trained")
    model_name_kaggle = '/kaggle/input/gemma-3/transformers/gemma-3-1b-pt/1'
    model_name_hf = 'google/gemma-3-1b-pt'

if os.path.exists(model_name_kaggle):
    model_name = model_name_kaggle
else:
    print("Warning: loading model weights from the Internet. This might take a bit of extra time.")
    model_name = model_name_hf

print("Loading the tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Loading the model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.bfloat16)
streamer = TextStreamer(tokenizer)
# Silence a warning.
tokenizer.decode([tokenizer.eos_token_id]);
print("Loaded.")
In [ ]:
# Check where the whole model is loaded and what data type it's using.
model.device, model.dtype
In [ ]:
# Check where each group of parameters is loaded. If this shows anything other
# than {'': 0}, some parts of the model were probably offloaded to the CPU and
# generation will be slow.
model.hf_device_map
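
As an extra sanity check (an added sketch, not part of the original lab steps), the transformers method get_memory_footprint() reports how much memory the model's parameters and buffers occupy.

In [ ]:
# Rough memory check: a ~1B-parameter model in bfloat16 should come to roughly 2 GB.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")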

Warm-Up

In [ ]:
%%time
doc = '''Expression: 2 + 2. Result:'''
#doc = '''The capital of France is'''
tokenized_doc = tokenizer(doc, return_tensors='pt')['input_ids']
with torch.inference_mode():
    model_out = model.generate(
        tokenized_doc.to(model.device),
        max_new_tokens=64,
        do_sample=False,
        streamer=streamer)
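
The streamer printed the text as it was generated, but model.generate also returns the full token sequence. As a small added example, you can decode that tensor directly:

In [ ]:
# model_out has shape (batch, sequence_length) and includes the prompt tokens,
# so decoding it reproduces the prompt followed by the model's continuation.
print(tokenizer.decode(model_out[0], skip_special_tokens=True))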

Chat Templating

In [ ]:
assert tokenizer.chat_template is not None, "Switch to the instruction-tuned model for this step."
In [ ]:
role = """You are a helpful 2nd-grade teacher. Help a 2nd grader to answer questions in a short and clear manner."""
task = """Explain why the sky is blue"""

messages = [
    {
        "role": "user",
        "content": f"{role}\n\n{task}",
    },
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
print(tokenizer.batch_decode(tokenized_chat)[0])
In [ ]:
# Use model.generate to complete this chat.
# your code here
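
If you get stuck, here is one possible completion (a sketch of one approach, not necessarily the intended solution): feed the chat-templated tokens to model.generate, just as in the warm-up cell.

In [ ]:
# One possible solution (sketch): generate a completion for the templated chat,
# reusing the streamer so the reply prints as it is produced.
with torch.inference_mode():
    chat_out = model.generate(
        tokenized_chat.to(model.device),
        max_new_tokens=256,
        do_sample=False,
        streamer=streamer)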

Retrieval-Augmented Generation

In [ ]:
# Gather some documents. For a simple example, we'll use the docstrings of PyTorch functions.
import inspect
docstrings = {}
for name, obj in inspect.getmembers(torch.nn):
    if inspect.isfunction(obj) or inspect.isclass(obj):
        doc = inspect.getdoc(obj)
        if doc:  # skip members that have no docstring
            docstrings[name] = doc
In [ ]:
docstrings.keys()
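
One simple way to use these documents is sketched below (an added example using naive keyword matching rather than any particular retrieval library; the query text and helper names are illustrative, and the instruction-tuned model must be loaded for the chat template): score each docstring by how many of the question's words it contains, then paste the best match into the prompt as context.

In [ ]:
# Added sketch: naive keyword retrieval over the docstrings, followed by a
# retrieval-augmented prompt built from the best-matching entry.
query = "How do I apply dropout to a tensor?"
query_words = [w.strip("?.,!").lower() for w in query.split() if len(w) > 3]

def keyword_score(text):
    text = text.lower()
    return sum(word in text for word in query_words)

# Score each entry by its name plus its docstring and keep the best match.
best_name = max(docstrings, key=lambda n: keyword_score(n + " " + docstrings[n]))
print("Retrieved:", best_name)

rag_messages = [
    {
        "role": "user",
        "content": f"Use the documentation below to answer the question.\n\n"
                   f"Documentation for torch.nn.{best_name}:\n{docstrings[best_name]}\n\n"
                   f"Question: {query}",
    },
]
rag_inputs = tokenizer.apply_chat_template(
    rag_messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
with torch.inference_mode():
    model.generate(rag_inputs.to(model.device), max_new_tokens=256,
                   do_sample=False, streamer=streamer)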