Course Objectives Addressed¶
This notebook addresses the following CS 375 course objectives:
- TM-LinearLayers: "I can implement linear (fully-connected) layers using efficient parallel code."
- TM-Softmax: "I can implement softmax and explain its role in classification networks."
- OG-LossFunctions: "I can select and compute appropriate loss functions for regression and classification tasks."
- OG-ProblemFraming-Supervised: "I can frame a problem as a supervised learning task with appropriate inputs, targets, and loss function."
It will also help set you up to make progress towards the following objectives in future labs:
- TM-Implement-TrainingLoop: "I can implement a basic training loop in PyTorch."
- TM-Autograd: "I can explain the purpose of automatic differentiation and identify how it is used in PyTorch code."
This notebook ties together linear layers, softmax, and cross-entropy loss into a complete classification pipeline, bridging from NumPy to PyTorch and from regression to classification — a foundation for the multi-layer networks you'll build next.
Setup¶
Let's import the necessary modules: pandas and NumPy for data wrangling, Matplotlib for plotting, and some sklearn utilities. This time we'll also import PyTorch.
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import scipy.optimize
# This utility function (inspired by https://github.com/HIPS/autograd/blob/master/autograd/misc/flatten.py)
# is useful for optimizing using scipy.optimize.minimize.
def flatten(arrs):
    """Return a 1D array containing the elements of the input arrays,
    and an unflatten function that takes a flattened array and returns
    the original arrays.
    """
    arrs = [np.asarray(arr) for arr in arrs]
    shapes = [arr.shape for arr in arrs]
    flat = np.concatenate([arr.ravel() for arr in arrs])
    start_indices = np.cumsum([0] + [arr.size for arr in arrs])
    def unflatten(params):
        return [params[start_indices[i]:start_indices[i+1]].reshape(shape)
                for i, shape in enumerate(shapes)]
    return flat, unflatten
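Here's a quick round-trip check of how flatten and unflatten work together, using toy arrays chosen just for illustration (the flatten definition from above is repeated so this cell runs on its own):

```python
import numpy as np

# Repeating the flatten definition from above so this cell runs standalone.
def flatten(arrs):
    arrs = [np.asarray(arr) for arr in arrs]
    shapes = [arr.shape for arr in arrs]
    flat = np.concatenate([arr.ravel() for arr in arrs])
    start_indices = np.cumsum([0] + [arr.size for arr in arrs])
    def unflatten(params):
        return [params[start_indices[i]:start_indices[i+1]].reshape(shape)
                for i, shape in enumerate(shapes)]
    return flat, unflatten

weights = np.arange(6.0).reshape(2, 3)   # a toy (2, 3) weight matrix
bias = np.array([10.0])                  # a toy bias
flat, unflatten = flatten([weights, bias])
print(flat.shape)                        # (7,) -- all 7 numbers in one 1D vector
w2, b2 = unflatten(flat)
print(np.array_equal(w2, weights))       # True -- the round trip preserves values
```

This is exactly why flatten is handy with scipy.optimize.minimize: the optimizer only sees one flat parameter vector, and unflatten recovers the individual weight and bias arrays inside the loss function.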
We'll load the data as before.
ames = pd.read_csv('https://github.com/kcarnold/AmesHousing/blob/master/data/ames.csv.gz?raw=true', compression="gzip")
ames['price'] = ames["Sale_Price"] / 100_000 # Make `price` be in units of $100k, to be easier to interpret.
# Create price categories (0, 1, or 2) for the classification task
n_classes = 3
ames['price_rank'] = ames['price'].rank(pct=True)
ames['price_bin'] = pd.cut(ames['price_rank'], bins=n_classes, labels=False)
# Prepare features and target
feature_names = ['Longitude', 'Latitude']
X = ames[feature_names].values
y = ames['price_bin'].values
# standardize the features, to make the optimization more stable
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X = (X - X_mean) / X_std
# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
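To see why the percentile-rank trick gives balanced classes, here's a tiny standalone sketch (the nine toy prices are made up for illustration): rank(pct=True) spreads the values uniformly over (0, 1], so cutting the ranks into three equal-width bins yields three equal-sized classes regardless of how skewed the raw prices are.

```python
import pandas as pd

# Nine made-up prices, deliberately skewed.
prices = pd.Series([1.0, 50.0, 2.0, 3.0, 40.0, 90.0, 95.0, 99.0, 45.0])
ranks = prices.rank(pct=True)            # uniform over (0, 1]
bins = pd.cut(ranks, bins=3, labels=False)
print(bins.value_counts().sort_index())  # 3 items in each of bins 0, 1, 2
```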
Basic EDA¶
Histogram of target values:
plt.hist(y_train);
ames['price_bin'].value_counts()
price_bin
0    981
1    980
2    969
Name: count, dtype: int64
Check shapes
print(f"train: X.shape={X_train.shape}, y.shape={y_train.shape}")
print(f"valid: X.shape={X_valid.shape}, y.shape={y_valid.shape}")
train: X.shape=(2344, 2), y.shape=(2344,)
valid: X.shape=(586, 2), y.shape=(586,)
Histogram of input feature values
plt.hist(X_train.flatten(), bins=30);
Part 1: Classification the wrong way (using linear regression)¶
Let's start by treating this as a regression problem: predicting the class number (0, 1, or 2) directly. We'll fit a linear regression model to the target as if it were a continuous variable. This isn't the right way to do it, but it will help us build up to the correct way.
1.A Using NumPy¶
You've seen the code below already. Fill in the blanks to complete the linear regression model in NumPy.
# Initialize parameters
np.random.seed(42)
n_samples, n_features = X_train.shape
initial_weights = np.random.standard_normal(size=...)
initial_bias = np.random.standard_normal()
def linear_forward(X, weights, bias):
    return X @ weights + bias

def compute_mse_loss(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()

def loss_given_params(params, X, y_true):
    weights, bias = unpack_params(params)
    y_pred = linear_forward(X, weights, bias)
    return compute_mse_loss(...)
initial_params, unpack_params = flatten([initial_weights, initial_bias])
initial_loss = loss_given_params(initial_params, X_train, y_train)
optimization_result = scipy.optimize.minimize(loss_given_params, initial_params,
args=(X_train, y_train))
fitted_weights_np, fitted_bias_np = unpack_params(optimization_result.x)
print(f"Initial loss: {initial_loss:.2f}")
print(f"Final loss: {optimization_result.fun:.2f}")
print(f"Fitted weights: {fitted_weights_np}")
print(f"Fitted bias: {fitted_bias_np}")
Initial loss: 1.33
Final loss: 0.56
Fitted weights: [-0.26364467 0.17475395]
Fitted bias: 0.9886359257077487
Let's evaluate the "accuracy" of this model. We'll round the predictions to the nearest integer to get the predicted class.
y_pred_valid = linear_forward(X_valid, fitted_weights_np, fitted_bias_np).round()
y_pred_valid[:10]
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
accuracy = (y_pred_valid == y_valid).mean()
print(f"Validation accuracy for linear regression 'classifier': {accuracy:.2f}")
Validation accuracy for linear regression 'classifier': 0.31
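For context, it helps to compare against the simplest possible baseline: always predicting the most common class. Here's a standalone sketch with a small made-up label array standing in for y_valid (the array and its values are hypothetical, for illustration only):

```python
import numpy as np

# Hypothetical validation labels standing in for y_valid.
y_toy = np.array([0, 1, 2, 0, 1, 2, 1, 2, 0, 1])
majority_class = np.bincount(y_toy).argmax()   # most frequent class
baseline_acc = (y_toy == majority_class).mean()
print(majority_class, baseline_acc)            # 1 0.4
```

Try the same computation on y_valid. Since our three price bins are roughly balanced, the majority-class baseline is about 1/3, so the regression "classifier" above is doing about as well as always guessing one class.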
1.B: Using PyTorch¶
Now let's implement the same (wrong) model in PyTorch. Notice that we're still doing this "the wrong way", treating classification as regression.
Your task: Fill in the nn.Linear(...) call below with the correct in_features and out_features. Since we're treating this as regression (predicting a single number), what should out_features be?
The training loop below shows the standard PyTorch pattern. Don't worry about understanding every line yet — we'll go through this in detail next week. For now, focus on the model setup.
# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
X_valid_torch = torch.tensor(X_valid, dtype=torch.float32)
# Note: still treating as regression, so y is float
y_train_torch = torch.tensor(y_train, dtype=torch.float32)
y_valid_torch = torch.tensor(y_valid, dtype=torch.float32)
# Create a linear model. Fill in the correct in_features and out_features.
# Since we're treating this as regression (predicting a single number), out_features should be 1.
model = nn.Linear(in_features=..., out_features=..., bias=True)
# Training loop (we'll talk next week about how this works)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
n_epochs = 100
for epoch in range(n_epochs):
    # Forward pass
    y_pred = model(X_train_torch).flatten()  # flatten because model outputs shape (N, 1) but y is shape (N,)
    loss = F.mse_loss(y_pred, y_train_torch)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch:02d}, Loss: {loss.item():.4f}')
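After training, the fitted parameters live inside the layer as model.weight and model.bias. A standalone sketch of how to pull them out as NumPy arrays for comparison with the scipy-fitted versions (the seed and values here are arbitrary; your trained numbers will differ):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
layer = nn.Linear(in_features=2, out_features=1, bias=True)  # same shape as the regression model
print(layer.weight.shape)   # torch.Size([1, 2]) -- note: (out_features, in_features)
print(layer.bias.shape)     # torch.Size([1])

# .detach() drops the autograd bookkeeping so we can convert to NumPy
w_np = layer.weight.detach().numpy()
b_np = layer.bias.detach().numpy()
print(w_np.shape, b_np.shape)
```

Note that nn.Linear stores its weight with shape (out_features, in_features), so it's the transpose of the NumPy weights vector we fitted in Part 1.A; keep that in mind when comparing them.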
Part 2: Classification the right way (softmax and cross-entropy)¶
2.A Using NumPy¶
Now let's modify our NumPy implementation to do classification properly:
- Output one number per class (logits)
- Convert logits to probabilities using softmax
- Use cross-entropy loss instead of MSE
We'll start by using one-hot encoding for the outputs, which will make it easy to implement the cross-entropy loss.
# np.eye is the identity matrix; each row is a one-hot vector.
# this code pulls out the one-hot vectors corresponding to the target values
y_train_onehot = np.eye(n_classes)[y_train]
y_valid_onehot = np.eye(n_classes)[y_valid]
y_train_onehot[:5]
array([[0., 1., 0.],
[0., 1., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.]])
Now let's implement softmax and cross-entropy. Fill in the blanks.
def softmax(logits):
    """Convert logits to probabilities."""
    num_samples, num_classes = logits.shape
    max_logit = logits.max(axis=1, keepdims=True)
    exp_logits = ...
    assert exp_logits.shape == (num_samples, num_classes)
    sum_exp_logits = exp_logits.sum(axis=1, keepdims=True)
    assert sum_exp_logits.shape == (num_samples, 1)
    return ...

def cross_entropy_loss(y_true_onehot, probs):
    """Compute cross entropy loss."""
    # Strategy:
    # - Compute log probabilities
    # - Multiply by one-hot vectors and sum, to extract the log probabilities of the true classes
    # - Take the negative to get the loss
    # - Average the loss over the samples
    log_probs = np.log(probs + 1e-6)  # add a small value to avoid log(0)
    loss_per_sample = -np.sum(y_true_onehot * log_probs, axis=1)
    assert loss_per_sample.shape == (len(y_true_onehot),)
    return ...
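Once you've filled in the blanks, a good habit is to sanity-check your softmax against a trusted implementation. Here's a standalone sketch using scipy.special.softmax on a small logits matrix (the numbers are arbitrary test values):

```python
import numpy as np
from scipy.special import softmax as scipy_softmax

logits = np.array([[1.0, 2.0, 3.0],
                   [0.0, 0.0, 0.0]])   # arbitrary test logits
probs = scipy_softmax(logits, axis=1)
print(probs.sum(axis=1))               # each row sums to 1
print(probs[1])                        # equal logits -> equal probabilities, 1/3 each
```

Your softmax(logits) should return the same array up to floating-point error; np.allclose is handy for the comparison.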
## Now let's put it all together: logistic regression in NumPy.
# Compare this with Part 1.A above. What changed?
np.random.seed(42)
n_samples, n_features = X_train.shape
# Weights and bias now output n_classes values instead of 1
# initial_weights = np.random.standard_normal(size=(?, ?))
initial_weights = np.random.standard_normal(size=(n_features, n_classes))
# initial_bias = np.random.standard_normal(size=?)
initial_bias = np.random.standard_normal(size=n_classes)
def linear_forward(X, weights, bias):
    return X @ weights + bias

def loss_given_params(params, X, y_true_onehot):
    weights, bias = unpack_params(params)
    logits = linear_forward(X, weights, bias)
    probs = softmax(logits)
    return cross_entropy_loss(y_true_onehot, probs)
initial_params, unpack_params = flatten([initial_weights, initial_bias])
initial_loss = loss_given_params(initial_params, X_train, y_train_onehot)
optimization_result = scipy.optimize.minimize(loss_given_params, initial_params,
args=(X_train, y_train_onehot))
fitted_weights_np, fitted_bias_np = unpack_params(optimization_result.x)
print(f"Initial loss: {initial_loss:.2f}")
print(f"Final loss: {optimization_result.fun:.2f}")
print(f"Fitted weights: {fitted_weights_np}")
print(f"Fitted bias: {fitted_bias_np}")
Now let's compute the accuracy again.
y_pred_valid_logits = linear_forward(X_valid, fitted_weights_np, fitted_bias_np)
y_pred_valid = y_pred_valid_logits.argmax(axis=1)
y_pred_valid[:10]
array([0, 1, 2, 2, 0, 2, 0, 1, 2, 2])
accuracy = (y_pred_valid == y_valid).mean()
print(f"Validation accuracy for logistic regression: {accuracy:.2f}")
Validation accuracy for logistic regression: 0.52
2.B Using PyTorch¶
Now let's see how PyTorch makes classification easier. We'll first walk through the key PyTorch primitives, then you'll put them together into a full training loop.
y_train_longtensor = torch.tensor(y_train)
y_valid_longtensor = torch.tensor(y_valid)
Setting up the linear layer¶
Fill in the blanks below with the correct number of features.
linear_layer = nn.Linear(in_features=..., out_features=..., bias=True)
logits = linear_layer(X_train_torch)
logits.shape
torch.Size([2344, 3])
Softmax¶
PyTorch has built-in functionality for softmax. We can use it like this:
probs = logits.softmax(axis=1)
probs.shape
torch.Size([2344, 3])
Let's check that the probabilities sum to 1 for each row. Think about what axis you should use for the sum. General sum rule: whatever axis we sum over will be the axis that disappears.
probs_sums = probs.sum(axis=...)
probs_sums.shape
torch.Size([2344])
probs_sums[:10]
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
1.0000], grad_fn=<SliceBackward0>)
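As a standalone illustration of that sum rule, here's a toy 2x3 NumPy array (nothing to do with the housing data): whichever axis we sum over disappears from the result's shape.

```python
import numpy as np

a = np.arange(6).reshape(2, 3)   # shape (2, 3)
print(a.sum(axis=0).shape)       # (3,) -- axis 0 disappears
print(a.sum(axis=1).shape)       # (2,) -- axis 1 disappears
```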
Cross-entropy loss¶
There are a few different ways we could implement a categorical cross-entropy loss in PyTorch; notice that they all give the same output. The F.cross_entropy approach is the most efficient and numerically stable because it combines the softmax operation, one-hot encoding, and the cross-entropy loss into a single step.
# Direct approach, analogous to the NumPy code above
y_train_onehot_torch = torch.tensor(y_train_onehot, dtype=torch.int64)
log_probs = probs.log()
-(y_train_onehot_torch * log_probs).sum(axis=1).mean()
tensor(1.1906, grad_fn=<NegBackward0>)
# cross-entropy loss function in PyTorch uses logits (without softmax) and class indices (without one-hot)
F.cross_entropy(logits, y_train_longtensor)
tensor(1.1906, grad_fn=<NllLossBackward0>)
Other ways to compute cross-entropy loss
# see also F.one_hot if you're curious.
F.nll_loss(log_probs, y_train_longtensor)
F.nll_loss(logits.log_softmax(axis=1), y_train_longtensor)
# using the "object-oriented" interface
loss_fn = nn.CrossEntropyLoss()
loss_fn(logits, y_train_longtensor)
Full PyTorch Implementation¶
Your turn: Modify your PyTorch linear regression code from Part 1.B to do logistic regression. Here's what needs to change:
- Model: out_features should be n_classes instead of 1 (no need to .flatten() the output)
- Loss function: Use F.cross_entropy(logits, y_train_longtensor) instead of F.mse_loss
- Target tensor: Use y_train_longtensor (integer class labels) instead of y_train_torch (floats)
# your code here
Epoch 00, Loss: 1.3116
Epoch 10, Loss: 1.1680
Epoch 20, Loss: 1.0995
Epoch 30, Loss: 1.0654
Epoch 40, Loss: 1.0468
Epoch 50, Loss: 1.0359
Epoch 60, Loss: 1.0292
Epoch 70, Loss: 1.0249
Epoch 80, Loss: 1.0220
Epoch 90, Loss: 1.0202
Looking ahead: a multi-layer network¶
Just as a preview for where we're going next week:
n_hidden = 100
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=n_hidden, bias=True),
    nn.ReLU(),
    nn.Linear(in_features=n_hidden, out_features=n_classes, bias=True),
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
n_epochs = 1000
for epoch in range(n_epochs):
    # Forward pass
    logits = model(X_train_torch)
    loss = F.cross_entropy(logits, y_train_longtensor)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss.item():.4f}')
# Evaluate on validation set
y_valid_longtensor = torch.tensor(y_valid) # make sure this is defined
logits_valid = model(X_valid_torch)
y_pred_valid = logits_valid.argmax(dim=1)
accuracy = (y_pred_valid == y_valid_longtensor).float().mean()
print(f'Validation accuracy: {accuracy.item():.2f}')
Analysis¶
Come back to these questions after you've finished the lab.
Compare the NumPy and PyTorch implementations. What are the main differences you notice? What are the benefits of PyTorch?
We came up with 4 weight matrices in the main part of this exercise (linear regression NumPy and PyTorch, logistic regression NumPy and PyTorch). Which of these matrices had the same shapes? Which ones had the same values? Why?
Which of these two models (linear regression vs logistic regression) does better on this task? (Which number would we use to compare them: loss, accuracy, or something else? Why?)
Suppose a job interviewer asks you to describe the similarities and differences between linear regression and logistic regression. What would you say? (Hint: discuss how each model makes a prediction, what kinds of patterns in the data they can use, how you measure training progress, etc.)
your thoughtful answers here