Multiple Linear Regression, the Hard Way

Goals

  • Extend our understanding of the linear regression model to multiple features

We're going to work through the house-price-prediction example again, but this time we'll do the linear regression part by hand, as we did last week for the single-feature case.

Setup

Let's import the necessary modules: pandas and NumPy for data wrangling, Matplotlib for plotting, and some sklearn utilities. We'll be implementing the linear regression model ourselves, so we don't need to import it from sklearn.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import scipy.optimize

# This utility function (inspired by https://github.com/HIPS/autograd/blob/master/autograd/misc/flatten.py)
# is useful for optimizing using scipy.optimize.minimize.
def flatten(arrs):
    """Return a 1D array containing the elements of the input arrays,
    and an unflatten function that takes a flattened array and returns
    the original arrays.
    """
    arrs = [np.asarray(arr) for arr in arrs]
    shapes = [arr.shape for arr in arrs]
    flat = np.concatenate([arr.ravel() for arr in arrs])
    start_indices = np.cumsum([0] + [arr.size for arr in arrs])
    def unflatten(params):
        return [params[start_indices[i]:start_indices[i+1]].reshape(shape)
                for i, shape in enumerate(shapes)]
    return flat, unflatten
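
Here's a quick sketch, separate from the assignment, of how flatten and the unflatten function it returns work together, and why that's convenient for scipy.optimize.minimize (which wants all parameters packed into a single 1D vector). The toy arrays and the quadratic loss below are made up purely for illustration.

In [ ]:
# Toy illustration (made-up numbers): flatten packs several arrays into one 1D
# vector, and unflatten recovers arrays with the original shapes.
toy_weights = np.array([1.0, 2.0])
toy_bias = np.array(3.0)
toy_flat, toy_unflatten = flatten([toy_weights, toy_bias])
print(toy_flat)                  # [1. 2. 3.]
print(toy_unflatten(toy_flat))   # [array([1., 2.]), array(3.)]

# minimize expects one 1D parameter vector, so we unflatten inside the loss.
def toy_loss(params):
    w, b = toy_unflatten(params)
    return np.sum(w ** 2) + (b - 1.0) ** 2  # arbitrary quadratic; minimum at w=0, b=1

toy_result = scipy.optimize.minimize(toy_loss, toy_flat)
print(toy_result.x)              # approximately [0. 0. 1.]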

We'll load the data as before.

In [ ]:
ames = pd.read_csv('https://github.com/kcarnold/AmesHousing/blob/master/data/ames.csv.gz?raw=true', compression="gzip")
ames['price'] = ames["Sale_Price"] / 100_000 # Make `price` be in units of $100k, to be easier to interpret.
In [ ]:
def plot_data():
    # You don't have to know how this function works.
    plt.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=.5)
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.colorbar(label="Sale Price ($100k)")
plot_data()
(Figure: map of home locations, colored by sale price)
In [ ]:
feature_names = ['Longitude', 'Latitude']
X = ames[feature_names].values
X.shape
Out[ ]:
(2930, 2)

Our target, called y by convention, will be the home price (we'll soon introduce a different y, but start with this one).

In [ ]:
y = ames['price'].values
y.shape
Out[ ]:
(2930,)

Notice that X has two axes and is therefore written in uppercase; y has one axis and is therefore written in lowercase. (This is sklearn convention; other libraries are less consistent about it.)

Now let's split the data into training and validation sets (sklearn calls these train and test, but that's fine). random_state is how sklearn specifies the random seed (it's actually slightly more flexible than a plain seed).

In [ ]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42)

We'll verify that the shapes make sense. Note how many items are in each of the sets.

In [ ]:
X_train.shape, y_train.shape
Out[ ]:
((2344, 2), (2344,))
In [ ]:
X_valid.shape, y_valid.shape
Out[ ]:
((586, 2), (586,))

Here's a function to plot our regression model in "data space" (i.e., what it would predict everywhere on the map).

This function is fairly specific to our use case, but you can use it as inspiration in other situations.

In [ ]:
def plot_model(prediction_fn, fig=None, prediction_args=()):
    # Compute extents
    lat_min = ames.Latitude.min()
    lat_max = ames.Latitude.max()
    lon_min = ames.Longitude.min()
    lon_max = ames.Longitude.max()
    price_min = ames.price.min()
    price_max = ames.price.max()

    # Ask the model for predictions on a grid
    xx, yy = np.meshgrid(np.linspace(lon_min, lon_max, 250), np.linspace(lat_min, lat_max, 250))
    Z = prediction_fn(np.c_[xx.ravel(), yy.ravel()], *prediction_args).reshape(xx.shape)

    if fig is None:
        fig = plt.figure(figsize=plt.figaspect(2))

    # Top: show the predictions in 2D. Superimpose the original data.
    ax = fig.add_subplot(2, 1, 1)
    surf = ax.contourf(xx, yy, Z, alpha=.5, cmap=plt.cm.viridis, vmin=price_min, vmax=price_max)
    ax.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=1, cmap='viridis', vmin=price_min, vmax=price_max)
    ax.set(xlabel="Longitude", ylabel="Latitude", title="2D contour view")
    fig.colorbar(surf, label="Sale Price ($100k)")

    # Bottom: show the predictions in 3D
    ax = fig.add_subplot(2, 1, 2, projection='3d')
    ax.plot_surface(xx, yy, Z, alpha=.5, cmap=plt.cm.viridis, vmin=price_min, vmax=price_max)
    #ax.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=1, cmap='viridis', vmin=price_min, vmax=price_max)
    ax.set(title="3D view")

Task

Part A: Linear regression

Step A1: Fit a linear regression model to the training set (X_train, y_train).

Last time you did this, you used sklearn. This time, you'll do it by hand. You'll need to calculate the weights w and bias b that minimize the mean squared error (MSE) between the model's predictions and the actual values.
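
As a reminder (just restating the standard definitions, not part of the assignment): for a single home with feature vector $x_i$, the model predicts $\hat{y}_i = w \cdot x_i + b$, and the mean squared error over $n$ predictions is $\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2$.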

I'll walk you through this. First, we'll work on making a single prediction. To do that, we'll need some weights and a bias, which we'll initialize randomly. Think about what shape these should be.

In [ ]:
np.random.seed(42)

weights = np.random.standard_normal(size=...)
bias = np.random.standard_normal()
weights, bias
Out[ ]:
(array([ 0.49671415, -0.1382643 ]), 0.6476885381006925)

Now we'll make a single prediction. Think about how you can do this using a dot product.
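
If it helps, here's a toy illustration with made-up numbers (not the housing data, and not the cell below) of how a dot product turns a feature vector, a weight vector, and a bias into a single predicted number.

In [ ]:
# Toy example (made-up numbers): a dot product multiplies matching entries
# and adds them up; then we add the bias.
toy_x = np.array([2.0, 3.0])   # two features for one imaginary home
toy_w = np.array([0.5, -1.0])  # one weight per feature
toy_b = 0.25
np.dot(toy_w, toy_x) + toy_b   # 0.5*2 + (-1.0)*3 + 0.25 = -1.75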

In [ ]:
x_i = X_train[0] # example data point
y_pred_i = ...
y_pred_i
Out[ ]:
np.float64(-51.67595519554893)

Next we'll make a prediction for every item in the training set. Think about how you can do this using a dot product.

In [ ]:
def linreg_forward(X, weights, bias):
    return X @ weights + bias

y_pred_train = linreg_forward(X_train, weights, bias)
y_pred_train.shape
Out[ ]:
(2344,)
In [ ]:
plot_model(linreg_forward, prediction_args=(weights, bias))
(Figure: the randomly initialized model's predictions over the map, shown as a 2D contour view and a 3D surface)

Now let's compute the loss and optimize it, as before.

In [ ]:
def compute_mse_loss(y_true, y_pred):
    return ...

mse_loss = compute_mse_loss(..., ...)
print("MSE loss:", mse_loss)
MSE loss: 2858.9442548850384

We'll pack the parameters into a single vector, and then we'll use the minimize function from scipy.optimize to find the best parameters.

In [ ]:
def loss_given_params(params, X, y_true):
    weights, bias = unpack_params(params)
    y_pred = ...
    return compute_mse_loss(...)

initial_params, unpack_params = flatten([weights, bias])
loss = loss_given_params(initial_params, X_train, y_train)
print("loss:", loss) # should be the same as mse_loss above
loss: 2858.9442548850384
In [ ]:
optimization_result = scipy.optimize.minimize(loss_given_params, initial_params, args=(X_train, y_train))
fitted_params = optimization_result.x
weights, bias = unpack_params(fitted_params)
print("Fitted weights:", weights)
print("Fitted bias:", bias)
Fitted weights: [-8.0546004  11.85138754]
Fitted bias: -1250.630305034187
In [ ]:
weights, bias = unpack_params(fitted_params)
plot_model(linreg_forward, prediction_args=(weights, bias))
(Figure: the fitted model's predictions over the map, shown as a 2D contour view and a 3D surface)
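
As an optional sanity check (a sketch, not one of the assignment steps), we could compare the fitted model's error on the training and validation sets using the mean_squared_error function we imported earlier.

In [ ]:
# Optional check: MSE on the training set vs. the held-out validation set,
# using the fitted weights and bias from above.
train_mse = mean_squared_error(y_train, linreg_forward(X_train, weights, bias))
valid_mse = mean_squared_error(y_valid, linreg_forward(X_valid, weights, bias))
print("Train MSE:", train_mse)
print("Validation MSE:", valid_mse)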

Analysis

  1. What are the shapes of X, y, w, and b? Explain why each has the shape it does.

  2. Suppose we were trying to predict the original, un-scaled sale price. (i.e., all of our predictions need to be multiplied by 100,000.) Could you adjust the model that we just trained to do that without needing to call minimize again? If so, how? If not, why not?

your thoughtful answers here