Image Embeddings¶
Today we'll explore what happens inside a neural network classifier, focusing on the embeddings (feature vectors) that the model learns.
Outline:
- Train a flower classifier (same as u01n1, just to get a model)
- Split the model into body (feature extractor) and head (classifier)
- Extract embeddings and explore what they capture
- Compare embedding similarity to raw pixel similarity
- Visualize the embedding space
- See how the classifier uses embeddings as prototypes
Course Objectives Addressed¶
- TM-Embeddings: "I can explain how neural networks represent data as vectors (embeddings) where geometric relationships encode meaning."
- TM-RepresentationLearning: "I can explain how a neural network learns useful internal representations through training."
- TM-DotProduct: "I can compute and reason about dot products of vectors."
- OG-Pretrained: "I can explain the benefits and risks of using pretrained models."
Setup¶
Same setup as u01n1. Run these cells without modification.
import random
import time
import os
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display, HTML
from PIL import Image
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
# Report library versions so results can be reproduced with the same stack.
print(f"PyTorch version: {torch.__version__}")
print(f"TorchVision version: {torchvision.__version__}")
# torch.accelerator is the unified accelerator API (covers CUDA, MPS, ...).
num_gpus = torch.accelerator.device_count()
print(f"Accelerators available: {num_gpus}")
if num_gpus == 0:
    # Warn visibly in the notebook; training on CPU works but is very slow.
    display(HTML("No Accelerators available. Training will be slow. <b>Please enable an accelerator.</b>"))
PyTorch version: 2.10.0 TorchVision version: 0.25.0 Accelerators available: 1
# Prefer the current accelerator (CUDA/MPS); fall back to CPU when none exists.
device = torch.accelerator.current_accelerator() or torch.device("cpu")
print(f"Using device: {device}")
Using device: mps
def set_seed(seed):
    """Seed every RNG in play (Python, NumPy, PyTorch) for reproducible runs."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def show_image_grid(images, titles=None, rows=None, cols=3, title_fontsize=8, figsize=(10, 10)):
    """Display a grid of PIL images.

    Args:
        images: sequence of PIL images (or anything np.array can turn into uint8).
        titles: optional per-image titles, same length as `images`.
        rows: number of grid rows; computed from `cols` when None.
        cols: number of grid columns.
        title_fontsize: font size for the per-image titles.
        figsize: matplotlib figure size in inches.
    """
    if rows is None:
        rows = (len(images) + (cols - 1)) // cols  # ceiling division
    fig, axs = plt.subplots(rows, cols, figsize=figsize)
    # plt.subplots returns a bare Axes (not an array) when rows == cols == 1,
    # so axs.flatten() would crash; wrap uniformly before flattening.
    axes = np.atleast_1d(axs).flatten()
    for i, ax in enumerate(axes):
        ax.axis('off')  # hide ticks/frames on every cell, including unused ones
        if i >= len(images):
            continue
        ax.imshow(np.array(images[i]).astype('uint8'))
        if titles is not None:
            ax.set_title(titles[i], fontsize=title_fontsize)
# Fraction of the dataset held out for validation.
VALIDATION_FRAC = 0.2

class config:
    # Run settings and hyperparameters, grouped for easy reference.
    seed = 123                 # RNG seed for reproducibility
    learning_rate = 1e-3       # Adam step size
    epochs = 1                 # number of passes over the training data
    batch_size = 16
    image_size = 256           # images are cropped/resized to this square size
    # ImageNet-pretrained EfficientNet-B0 weights; also supplies the matching
    # preprocessing transforms (see data_transforms below).
    pretrained_weights = models.EfficientNet_B0_Weights.IMAGENET1K_V1
    freeze_backbone = False    # True would train only the classifier head

set_seed(config.seed)
Load the data¶
import urllib.request
import tarfile

# Download and unpack the TensorFlow "flower_photos" dataset into ./data.
# Both steps are skipped when the files already exist, so re-running is cheap.
url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
download_dir = Path("./data")
download_dir.mkdir(parents=True, exist_ok=True)
archive_path = download_dir / "flower_photos.tgz"
extract_path = download_dir / "flower_photos"

if not archive_path.exists():
    print(f"Downloading {url} to {archive_path}...")
    urllib.request.urlretrieve(url, archive_path)
    print("Download complete.")

if not extract_path.exists():
    print(f"Extracting {archive_path} to {extract_path}...")
    with tarfile.open(archive_path, "r:gz") as tar:
        # filter="data" rejects unsafe tar members (path traversal, device
        # files, absolute paths) and silences the Python 3.12+ deprecation
        # warning about calling extractall without an extraction filter.
        tar.extractall(path=download_dir, filter="data")
    print("Extraction complete.")

data_path = extract_path
print(f"Data path set to: {data_path}")
Data path set to: data/flower_photos
# Preprocessing pipeline supplied by the pretrained weights, so inputs are
# resized/cropped/normalized the same way the backbone was trained.
data_transforms = config.pretrained_weights.transforms(crop_size=config.image_size)

# ImageFolder infers labels from subdirectory names (daisy/, dandelion/, ...).
full_dataset = datasets.ImageFolder(root=data_path, transform=data_transforms)
class_names = full_dataset.classes

# Deterministic train/validation split: the generator is seeded so the same
# images land in the same split on every run.
val_size = int(VALIDATION_FRAC * len(full_dataset))
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(config.seed)
)

# Use half the CPUs for loading. NOTE(review): the 'fork' start method is not
# available on Windows — presumably this notebook targets macOS/Linux; confirm.
num_dataloader_workers = os.cpu_count() // 2 if os.cpu_count() else 0
train_dataloader = DataLoader(
    train_dataset, batch_size=config.batch_size, shuffle=True,
    num_workers=num_dataloader_workers,
    multiprocessing_context='fork' if num_dataloader_workers > 0 else None
)
# Validation is not shuffled so indices stay aligned with val_dataset.indices.
val_dataloader = DataLoader(
    val_dataset, batch_size=config.batch_size, shuffle=False,
    num_workers=num_dataloader_workers,
    multiprocessing_context='fork' if num_dataloader_workers > 0 else None
)
print(f"Classes: {class_names}")
print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")
Classes: ['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips'] Training samples: 2936, Validation samples: 734
We also need a way to load the original (un-transformed) images for display. The transformed images are normalized and hard to visualize, so we'll load images directly from file paths when we need to display them.
def get_val_image(idx):
    """Load the original (un-transformed) image for a validation set index."""
    # Map the validation-subset index back to the underlying dataset sample.
    sample_path, _ = full_dataset.samples[val_dataset.indices[idx]]
    image = Image.open(sample_path).convert('RGB')
    return image.resize((config.image_size, config.image_size))
def get_val_images(indices):
    """Load original images for a list of validation set indices."""
    return list(map(get_val_image, indices))
Train a model¶
Same as u01n1: load a pretrained EfficientNet, replace the classification head, train for 1 epoch.
# Start from an ImageNet-pretrained EfficientNet-B0 backbone.
model = models.efficientnet_b0(weights=config.pretrained_weights)

if config.freeze_backbone:
    # Freeze all layers except the classifier
    # (the replacement Linear created below is new, so it stays trainable).
    for param in model.parameters():
        param.requires_grad = False

# Swap the pretrained head's final Linear layer for one sized to our classes.
num_features = model.classifier[-1].in_features
model.classifier[-1] = nn.Linear(num_features, len(class_names))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
# One epoch of training. Order matters within each iteration: backward()
# accumulates gradients, step() applies them, zero_grad() clears them so the
# next batch starts fresh.
model.train()
for inputs, labels in tqdm(train_dataloader, desc="Training"):
    inputs, labels = inputs.to(device), labels.to(device)
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
# Quick validation check
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_inputs, batch_labels in val_dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        # Predicted class = index of the largest logit per row.
        predictions = model(batch_inputs).argmax(dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)
print(f"Validation accuracy: {correct / total:.4f}")
Training: 0%| | 0/184 [00:00<?, ?it/s]
Validation accuracy: 0.8624
The Body and the Head¶
A neural network trained for classification can be split into two parts:
- Body (feature extractor / backbone): takes an image and produces a vector of numbers (the embedding)
- Head (classifier): takes that vector and produces class predictions
Let's look at the structure of our EfficientNet model:
# List the model's top-level children (body pieces + classifier head),
# then print the head in full.
for child_name, child_module in model.named_children():
    print(f"{child_name}: {type(child_module).__name__}")
print()
print("Classifier (head):")
print(model.classifier)
features: Sequential avgpool: AdaptiveAvgPool2d classifier: Sequential Classifier (head): Sequential( (0): Dropout(p=0.2, inplace=True) (1): Linear(in_features=1280, out_features=5, bias=True) )
The body is features + avgpool: it processes the image through many layers and produces a single vector.
The head is classifier: a Dropout layer followed by a single Linear layer that maps the feature vector to class scores.
We can build a feature extractor by copying the model and removing the final Linear layer from the classifier. You've already seen this trick:
import copy

# Clone the trained model and delete its final Linear layer in place, leaving
# the body (features + avgpool) plus the Dropout. Deep-copying keeps the
# original `model` intact for later use as a classifier.
feature_extractor = copy.deepcopy(model)
del feature_extractor.classifier[-1]
# eval() disables Dropout so embeddings are deterministic.
feature_extractor = feature_extractor.eval()
print("Feature extractor classifier (no more Linear layer):")
print(feature_extractor.classifier)
Feature extractor classifier (no more Linear layer): Sequential( (0): Dropout(p=0.2, inplace=True) )
Let's try it on a single batch to see what comes out:
# Push one validation batch through the body to inspect the embedding shape.
sample_images, sample_labels = next(iter(val_dataloader))
with torch.no_grad():
    sample_features = feature_extractor(sample_images.to(device))
print("Input shape:", sample_images.shape)
print("Output shape:", sample_features.shape)
Input shape: torch.Size([16, 3, 256, 256]) Output shape: torch.Size([16, 1280])
Task: What do the numbers in the output shape mean? What happened to the 256×256×3 image?
Your answer here.
Extract Embeddings¶
Now let's extract the embedding (feature vector) for every image in the validation set. Each image becomes a single vector of 1,280 numbers — its embedding.
# Run every validation batch through the body and collect results on the CPU.
feature_batches = []
label_batches = []
with torch.no_grad():
    for images, labels in tqdm(val_dataloader, desc="Extracting embeddings"):
        embeddings = feature_extractor(images.to(device))
        feature_batches.append(embeddings.cpu())
        label_batches.append(labels)

# Concatenate the per-batch chunks into one (N, 1280) array and one (N,) array.
val_features = torch.cat(feature_batches).numpy()
val_labels = torch.cat(label_batches).numpy()
print("Embeddings shape:", val_features.shape)
print("Labels shape:", val_labels.shape)
Extracting embeddings: 0%| | 0/46 [00:00<?, ?it/s]
Embeddings shape: (734, 1280) Labels shape: (734,)
Task: The embeddings have shape (734, 1280). What does the 734 represent? What does the 1280 represent?
Your answer here.
Similarity in Embedding Space¶
If the model has learned useful representations, then similar images should have similar embeddings. Let's test this.
First, let's pick three example images: two sunflowers and one tulip.
# Validation-set positions of each class of interest.
sunflower_indices = np.flatnonzero(val_labels == class_names.index('sunflowers')).tolist()
tulip_indices = np.flatnonzero(val_labels == class_names.index('tulips')).tolist()
# Two sunflowers and one tulip as running examples.
example_indices = [sunflower_indices[0], sunflower_indices[2], tulip_indices[0]]
show_image_grid(
    get_val_images(example_indices),
    titles=['sunflower A', 'sunflower B', 'tulip C'])
Now let's compute dot products between their embeddings. Recall that the dot product measures how much two vectors point in the same direction — it's a rough measure of similarity.
# Pull the three example embeddings and compare them pairwise with dot products.
vec_a, vec_b, vec_c = val_features[example_indices]
print(f"sunflower A · sunflower B = {vec_a @ vec_b:.2f}")
print(f"sunflower A · tulip C = {vec_a @ vec_c:.2f}")
print(f"sunflower B · tulip C = {vec_b @ vec_c:.2f}")
sunflower A · sunflower B = 17.95 sunflower A · tulip C = 6.81 sunflower B · tulip C = -15.59
Task: Which pair is most similar according to the dot product? Does this match your intuition?
Your answer here.
Cosine similarity¶
Raw dot products depend on the magnitude of the vectors — a vector with large values will have a high dot product with everything. To control for this, we normalize each vector to unit length before computing the dot product. This is called cosine similarity, and it ranges from -1 (opposite) to +1 (identical direction).
def normalize(vectors):
    """Normalize each row to unit length."""
    # Row-wise Euclidean length, kept 2-D so division broadcasts per row.
    row_lengths = np.sqrt((vectors ** 2).sum(axis=1, keepdims=True))
    return vectors / row_lengths
# Cosine similarity = dot product of unit-length vectors. The resulting matrix
# is symmetric with 1.0 on the diagonal (every vector matches itself exactly).
example_normed = normalize(val_features[example_indices])
cosine_sim = example_normed @ example_normed.T
pd.DataFrame(
    cosine_sim,
    index=['sunflower A', 'sunflower B', 'tulip C'],
    columns=['sunflower A', 'sunflower B', 'tulip C']
).round(3)
| sunflower A | sunflower B | tulip C | |
|---|---|---|---|
| sunflower A | 1.000 | 0.228 | 0.063 |
| sunflower B | 0.228 | 1.000 | -0.095 |
| tulip C | 0.063 | -0.095 | 1.000 |
Finding similar images¶
Let's use cosine similarity to find the images most and least similar to our first sunflower.
# Rank every validation image by cosine similarity to the query sunflower.
query_idx = example_indices[0]
query_vec = val_features[query_idx]
# Compute cosine similarity of the query against all validation embeddings
val_normed = normalize(val_features)
unit_query = query_vec / np.linalg.norm(query_vec)
similarities = val_normed @ unit_query
most_similar = np.argsort(similarities)[::-1]  # descending similarity
print("Most similar (by embedding):")
show_image_grid(get_val_images(most_similar[:9]))
Most similar (by embedding):
Exercise: Show the 9 least similar images. (Hint: just change which end of most_similar you slice from.)
# your code here
Least similar (by embedding):
Embeddings vs Raw Pixels¶
Are embeddings actually better than just comparing raw pixels? Let's find out.
We'll flatten each image into a single long vector of pixel values and compute similarity the same way.
# Collect raw (transformed) images as flat pixel vectors
flat_batches = [images.reshape(images.size(0), -1) for images, _ in val_dataloader]
raw_pixels = torch.cat(flat_batches).numpy()
print("Raw pixel vectors shape:", raw_pixels.shape)
print(f"Each image is now a vector of {raw_pixels.shape[1]} numbers")
Raw pixel vectors shape: (734, 196608) Each image is now a vector of 196608 numbers
# Similarity in pixel space
query_pixels = raw_pixels[query_idx]
# Use cosine similarity here too, exactly as we did for embeddings, so the two
# comparisons are fair: a raw (un-normalized) dot product is dominated by
# vector magnitude — i.e. overall image brightness — rather than content.
pixels_normed = normalize(raw_pixels)
unit_query_pixels = query_pixels / np.linalg.norm(query_pixels)
pixel_similarities = pixels_normed @ unit_query_pixels
most_similar_pixels = np.argsort(pixel_similarities)[::-1]
print("Most similar (by raw pixels):")
show_image_grid(get_val_images(most_similar_pixels[:9]))
Most similar (by raw pixels):
Task: Compare the "most similar" results from embeddings vs raw pixels. Which approach better captures what is in the image (as opposed to surface-level appearance like color or brightness)? Why do you think that is?
Your answer here.
Task: We saw two ways to represent images:
- Raw pixels (256 × 256 × 3 = 196,608 numbers)
- Embeddings (1,280 numbers)
For each representation: What information does it preserve? What information does it discard?
Your answer here.
Visualizing the Embedding Space¶
Our embeddings live in 1,280 dimensions — way too many to visualize directly. But we can project them down to 2D and see if structure emerges.
Important caveat: Any single 2D projection can only show a tiny slice of the full 1,280-dimensional space. Different projections will reveal different structure. To get a less misleading picture, we'll look at several projections side by side.
First, we'll use PCA (Principal Component Analysis) to find the directions of greatest variation. Then we'll also try LDA (Linear Discriminant Analysis), which finds directions that best separate the classes. (The details of PCA and LDA are beyond this course — just think of them as different strategies for flattening 1,280 dimensions into 2.)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
# Get consistent colors for each class
tab10 = plt.cm.tab10
class_colors = [tab10(i) for i in range(len(class_names))]

# Fit multiple projections on the raw (un-normalized) embeddings,
# since the classifier uses dot products, not cosine similarity.
pca = PCA(n_components=6).fit(val_features)
lda = LinearDiscriminantAnalysis(n_components=2).fit(val_features, val_labels)

# Four different 2D views of the same 1280-dimensional embedding space.
projections = {
    'PCA dims 1–2': pca.transform(val_features)[:, :2],
    'PCA dims 3–4': pca.transform(val_features)[:, 2:4],
    'PCA dims 5–6': pca.transform(val_features)[:, 4:6],
    'LDA (best class separation)': lda.transform(val_features),
}

# One scatter plot per projection, colored by class.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
for ax, (title, proj) in zip(axes.flat, projections.items()):
    for i, name in enumerate(class_names):
        mask = val_labels == i
        ax.scatter(proj[mask, 0], proj[mask, 1], label=name,
                   color=class_colors[i], alpha=0.5, s=10)
    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
# One shared legend, placed outside the last subplot.
axes.flat[-1].legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=3)
plt.tight_layout()
plt.show()

# Keep references for later use
projected = projections['LDA (best class separation)']
Task: What do you notice about how the classes are arranged? Why do clusters form?
Your answer here.
Prototypes: How the Classifier Uses Embeddings¶
Now let's look at the head — the final Linear layer that maps embeddings to class predictions.
This layer has a weight matrix where each row is a learned vector for one class. We can think of each row as a prototype — the model's internal template for what that class "looks like" in embedding space. The classifier predicts a class by computing how similar (dot product) each image's embedding is to each prototype.
# Pull the head's parameters off the accelerator into NumPy arrays.
head_layer = model.classifier[-1]
classifier_weights = head_layer.weight.detach().cpu().numpy()
classifier_bias = head_layer.bias.detach().cpu().numpy()
print("Classifier weights shape:", classifier_weights.shape)
print("Classifier bias shape:", classifier_bias.shape)
print(f"\nThere are {classifier_weights.shape[0]} prototypes, one per class, each with {classifier_weights.shape[1]} dimensions.")
Classifier weights shape: (5, 1280) Classifier bias shape: (5,) There are 5 prototypes, one per class, each with 1280 dimensions.
Let's extract the prototype for one class and see which images align with it most.
Exercise: Extract the prototype for the "roses" class. (Hint: class_names tells you the index.)
rose_idx = ...  # TODO(student): find the index of 'roses' in class_names
rose_prototype = classifier_weights[...]  # TODO(student): select that row of the weight matrix
print(f"Rose prototype shape: {rose_prototype.shape}")
Rose prototype shape: (1280,)
Now compute the dot product of every validation embedding with the rose prototype. This tells us how "rosy" the model thinks each image is.
# Dot product of every embedding with the rose prototype: each image's raw
# "roses" logit contribution (the head would add classifier_bias on top).
rose_scores = val_features @ rose_prototype
print("Rose scores shape:", rose_scores.shape)

# np.argsort is ascending: least rosy first, most rosy last
# (the following exercise relies on this ordering).
images_by_rosiness = np.argsort(rose_scores)
print("\nMost 'rosy' images:")
show_image_grid(get_val_images(images_by_rosiness[::-1][:9]))
Rose scores shape: (734,) Most 'rosy' images:
Exercise: Show the least rosy images.
# your code here
Least 'rosy' images:
We can also project the prototypes into the same LDA space. Since prototypes are directions (weight vectors), not points, we'll show them as arrows from the origin. Each arrow points in the direction that the classifier associates with that class.
# Overlay the class prototype directions (arrows from the origin) on the
# LDA-projected embeddings.
# Project prototypes using only the LDA rotation, without centering.
# lda.transform() subtracts the data mean first, which is correct for data points
# but wrong for weight vectors (they're directions, not points in data space).
projected_prototypes = classifier_weights @ lda.scalings_[:, :2]
# scale up to be on a similar scale to the projected data points
projected_prototypes *= 5

plt.figure(figsize=(8, 6))
for i, name in enumerate(class_names):
    mask = val_labels == i
    plt.scatter(projected[mask, 0], projected[mask, 1], label=name,
                color=class_colors[i], alpha=0.4, s=10)
    # Show prototype as an arrow from the origin
    plt.annotate('', xy=projected_prototypes[i], xytext=(0, 0),
                 arrowprops=dict(arrowstyle='->', color=class_colors[i], lw=2))
    # Label each arrow tip with its class name.
    plt.text(projected_prototypes[i, 0], projected_prototypes[i, 1], f' {name}',
             color=class_colors[i], fontsize=9, fontweight='bold')
plt.scatter(0, 0, color='black', s=30, zorder=10, marker='o')  # mark the origin
plt.legend(markerscale=3)
plt.title("Embeddings + class prototype directions (arrows)")
plt.show()
Task: How do the prototypes (arrows) relate to the data points in their class? In other classes?
Your answer here.
Wrap-up¶
Task: In your own words, what is an embedding? How is it different from the raw image?
Your answer here.
Task: Suppose you have a small dataset of bird photos (only 50 images) and you want to build a bird species classifier. How could you use the pretrained EfficientNet body (which was trained on ImageNet, not birds specifically) to help? Why would this work better than training from scratch?
Your answer here.