Regression in scikit-learn¶

Goals¶

  • Practice with the fit and predict interface of sklearn models
  • Get a visual sense of how different regression models work.

Rationale¶

Neural nets are strong performers for data that lacks clear features. But for well-structured tabular data with meaningful features (or data that can be translated to that form), simple models can sometimes perform very well, and can be much faster and sometimes more interpretable. Even if you plan to fit a neural net model, training a decision tree or random forest first can be a good quick first pass.

The scikit-learn (sklearn) fit-predict interface has become the de facto industry standard for this kind of modeling, so it's highly likely that what you see here will be useful in your future work.

Documentation¶

The sklearn documentation is exemplary. See:

  • Linear Models for, e.g., linear regression
  • Decision Trees
  • Ensemble Methods for, e.g., random forests

Setup¶

Let's import the necessary modules: pandas and NumPy for data wrangling, Matplotlib for plotting, and some sklearn models and utilities.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeRegressor
import sklearn.tree
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

We'll load the data. We're using a dataset of home sale prices from the Ames, Iowa assessor's database, described in De Cock (2011). (DATA 202 students may remember seeing this dataset.)

Pandas (typically imported as pd, see above) is a very useful library for working with tabular datasets. We'll see here that we can easily read a CSV file directly off the Internet...

In [ ]:
ames = pd.read_csv('https://github.com/kcarnold/AmesHousing/blob/master/data/ames.csv.gz?raw=true', compression="gzip")

The main object from pandas is a DataFrame. It holds a table of data:

In [ ]:
ames.head()
Out[ ]:
MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape Land_Contour Utilities Lot_Config ... Fence Misc_Feature Misc_Val Mo_Sold Year_Sold Sale_Type Sale_Condition Sale_Price Longitude Latitude
0 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 141 31770 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner ... No_Fence NaN 0 5 2010 WD Normal 215000 -93.619754 42.054035
1 One_Story_1946_and_Newer_All_Styles Residential_High_Density 80 11622 Pave No_Alley_Access Regular Lvl AllPub Inside ... Minimum_Privacy NaN 0 6 2010 WD Normal 105000 -93.619756 42.053014
2 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 81 14267 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner ... No_Fence Gar2 12500 6 2010 WD Normal 172000 -93.619387 42.052659
3 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 93 11160 Pave No_Alley_Access Regular Lvl AllPub Corner ... No_Fence NaN 0 4 2010 WD Normal 244000 -93.617320 42.051245
4 Two_Story_1946_and_Newer Residential_Low_Density 74 13830 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Inside ... Minimum_Privacy NaN 0 3 2010 WD Normal 189900 -93.638933 42.060899

5 rows × 81 columns

Each column of data generally has a consistent data type. (Note: object columns are the exception. They usually mean "string", but could actually hold any Python object.)

In [ ]:
ames.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 81 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS_SubClass         2930 non-null   object 
 1   MS_Zoning           2930 non-null   object 
 2   Lot_Frontage        2930 non-null   int64  
 3   Lot_Area            2930 non-null   int64  
 4   Street              2930 non-null   object 
 5   Alley               2930 non-null   object 
 6   Lot_Shape           2930 non-null   object 
 7   Land_Contour        2930 non-null   object 
 8   Utilities           2930 non-null   object 
 9   Lot_Config          2930 non-null   object 
 10  Land_Slope          2930 non-null   object 
 11  Neighborhood        2930 non-null   object 
 12  Condition_1         2930 non-null   object 
 13  Condition_2         2930 non-null   object 
 14  Bldg_Type           2930 non-null   object 
 15  House_Style         2930 non-null   object 
 16  Overall_Qual        2930 non-null   object 
 17  Overall_Cond        2930 non-null   object 
 18  Year_Built          2930 non-null   int64  
 19  Year_Remod_Add      2930 non-null   int64  
 20  Roof_Style          2930 non-null   object 
 21  Roof_Matl           2930 non-null   object 
 22  Exterior_1st        2930 non-null   object 
 23  Exterior_2nd        2930 non-null   object 
 24  Mas_Vnr_Type        1155 non-null   object 
 25  Mas_Vnr_Area        2930 non-null   int64  
 26  Exter_Qual          2930 non-null   object 
 27  Exter_Cond          2930 non-null   object 
 28  Foundation          2930 non-null   object 
 29  Bsmt_Qual           2930 non-null   object 
 30  Bsmt_Cond           2930 non-null   object 
 31  Bsmt_Exposure       2930 non-null   object 
 32  BsmtFin_Type_1      2930 non-null   object 
 33  BsmtFin_SF_1        2930 non-null   int64  
 34  BsmtFin_Type_2      2930 non-null   object 
 35  BsmtFin_SF_2        2930 non-null   int64  
 36  Bsmt_Unf_SF         2930 non-null   int64  
 37  Total_Bsmt_SF       2930 non-null   int64  
 38  Heating             2930 non-null   object 
 39  Heating_QC          2930 non-null   object 
 40  Central_Air         2930 non-null   object 
 41  Electrical          2930 non-null   object 
 42  First_Flr_SF        2930 non-null   int64  
 43  Second_Flr_SF       2930 non-null   int64  
 44  Low_Qual_Fin_SF     2930 non-null   int64  
 45  Gr_Liv_Area         2930 non-null   int64  
 46  Bsmt_Full_Bath      2930 non-null   int64  
 47  Bsmt_Half_Bath      2930 non-null   int64  
 48  Full_Bath           2930 non-null   int64  
 49  Half_Bath           2930 non-null   int64  
 50  Bedroom_AbvGr       2930 non-null   int64  
 51  Kitchen_AbvGr       2930 non-null   int64  
 52  Kitchen_Qual        2930 non-null   object 
 53  TotRms_AbvGrd       2930 non-null   int64  
 54  Functional          2930 non-null   object 
 55  Fireplaces          2930 non-null   int64  
 56  Fireplace_Qu        2930 non-null   object 
 57  Garage_Type         2930 non-null   object 
 58  Garage_Finish       2930 non-null   object 
 59  Garage_Cars         2930 non-null   int64  
 60  Garage_Area         2930 non-null   int64  
 61  Garage_Qual         2930 non-null   object 
 62  Garage_Cond         2930 non-null   object 
 63  Paved_Drive         2930 non-null   object 
 64  Wood_Deck_SF        2930 non-null   int64  
 65  Open_Porch_SF       2930 non-null   int64  
 66  Enclosed_Porch      2930 non-null   int64  
 67  Three_season_porch  2930 non-null   int64  
 68  Screen_Porch        2930 non-null   int64  
 69  Pool_Area           2930 non-null   int64  
 70  Pool_QC             2930 non-null   object 
 71  Fence               2930 non-null   object 
 72  Misc_Feature        106 non-null    object 
 73  Misc_Val            2930 non-null   int64  
 74  Mo_Sold             2930 non-null   int64  
 75  Year_Sold           2930 non-null   int64  
 76  Sale_Type           2930 non-null   object 
 77  Sale_Condition      2930 non-null   object 
 78  Sale_Price          2930 non-null   int64  
 79  Longitude           2930 non-null   float64
 80  Latitude            2930 non-null   float64
dtypes: float64(2), int64(33), object(46)
memory usage: 1.8+ MB
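If you just want the dtype counts, one option is the value_counts method:

In [ ]:
# Count how many columns have each dtype; compare with the summary line above.
ames.dtypes.value_counts()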

It behaves like a dictionary of its columns. Each column is a Series object.

In [ ]:
type(ames['Sale_Price'])
Out[ ]:
pandas.core.series.Series

Series support broadcast operations, similar to NumPy arrays and Torch tensors; they also have other functionality.

In [ ]:
ames['price'] = ames["Sale_Price"] / 100_000 # Make `price` be in units of $100k, to be easier to interpret.
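For example, here is a sketch of a reduction and a broadcast comparison on the price column:

In [ ]:
# A couple of Series operations: a reduction (.mean) and a broadcast comparison.
print(ames['price'].mean())        # average sale price, in units of $100k
print((ames['price'] > 2).mean())  # fraction of homes that sold for over $200k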

Now let's look into this dataset.

We'll define some functions to plot the data and the models. Since we have latitude and longitude for each home, we can plot the data in 2D, with color indicating the sale price.

(Sorry, you'll just have to imagine there's a map underneath.)

In [ ]:
def plot_data():
    # You don't have to know how this function works.
    plt.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=.5)
    plt.xlabel("Longitude"); plt.ylabel("Latitude")
    plt.colorbar(label="Sale Price ($100k)")
plot_data()
[Figure: scatter plot of home locations (Longitude vs. Latitude), colored by sale price.]

We'll try to predict home price based on location (which the realtors assure us is the most important factor anyway). So we'll grab the Latitude and Longitude columns of the data. We'll call that input data X, by convention. There are several different ways to index into a pandas DataFrame; using a list gives us a DataFrame with just the columns with those names. We'll then access the underlying NumPy data by using .values.

In [ ]:
feature_names = ['Longitude', 'Latitude']
X = ames[feature_names].values
X.shape
Out[ ]:
(2930, 2)

Our target, called y by convention, will be the home price (we'll soon introduce a different y, but start with this one).

In [ ]:
y = ames['price'].values
y.shape
Out[ ]:
(2930,)

Notice that X has two axes and thus is written in uppercase; y has one axis and thus is written in lowercase. (This is sklearn convention; other libraries are less consistent about this.)

Now let's split the data into a training set and a validation set (sklearn's function calls the held-out set "test", but we'll use it for validation, which is fine). random_state is how sklearn specifies the random seed (it's actually slightly more flexible than a seed).

In [ ]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42)

We'll verify that the shapes make sense. Note how many items are in each of the sets.

In [ ]:
X_train.shape, y_train.shape
Out[ ]:
((2344, 2), (2344,))
In [ ]:
X_valid.shape, y_valid.shape
Out[ ]:
((586, 2), (586,))

Here's a function to plot our regression model in "data space" (i.e., what it would predict everywhere on the map).

This function is pretty customized to our specific use case, though you can get inspiration from it for use in other situations.

In [ ]:
def plot_model(model, fig=None):
    # Compute extents
    lat_min = ames.Latitude.min()
    lat_max = ames.Latitude.max()
    lon_min = ames.Longitude.min()
    lon_max = ames.Longitude.max()
    price_min = ames.price.min()
    price_max = ames.price.max()

    # Ask the model for predictions at every point on a grid
    xx, yy = np.meshgrid(np.linspace(lon_min, lon_max, 250), np.linspace(lat_min, lat_max, 250))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    if fig is None:
        fig = plt.figure(figsize=plt.figaspect(2))

    # Top: show the predictions in 2D. Superimpose the original data.
    ax = fig.add_subplot(2, 1, 1)
    surf = ax.contourf(xx, yy, Z, alpha=.5, cmap=plt.cm.viridis, vmin=price_min, vmax=price_max)
    ax.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=1, cmap='viridis', vmin=price_min, vmax=price_max)
    ax.set(xlabel="Longitude", ylabel="Latitude", title="2D contour view")
    fig.colorbar(surf, label="Sale Price ($100k)")

    # Bottom: show the predictions in 3D
    ax = fig.add_subplot(2, 1, 2, projection='3d')
    ax.plot_surface(xx, yy, Z, alpha=.5, cmap=plt.cm.viridis, vmin=price_min, vmax=price_max)
    #ax.scatter(ames['Longitude'], ames['Latitude'], c=ames["price"], s=1, cmap='viridis', vmin=price_min, vmax=price_max)
    ax.set(title="3D view")

Task¶

Part A: Linear regression¶

Step A1: Fit a linear regression model (call it linreg) to the training set (X_train, y_train).

In [ ]:
linreg = LinearRegression().fit(X_train, y_train)
print("Prediction equation:")
print('y_pred = '
    + ' + '.join(f'{coef:.3f} * {name}' for coef, name in zip(linreg.coef_, feature_names))
    + f' + {linreg.intercept_:.3f}')
Prediction equation:
y_pred = -7.955 * Longitude + 11.886 * Latitude + -1242.742

Here are the Longitude and Latitude coordinates of the first home in the validation set. Compute what the model would predict for this home, based on the prediction equation shown above.

In [ ]:
example_home = X_valid[0]
example_home
Out[ ]:
array([-93.621065,  42.029038])
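If you want to check your hand computation afterward, one way (a sketch, using the full-precision fitted coefficients rather than the rounded ones printed above) is to apply the prediction equation directly:

In [ ]:
# Apply the prediction equation to the example home: dot(coefficients, features) + intercept.
manual_pred = linreg.coef_ @ example_home + linreg.intercept_
print(f"Manual prediction: {manual_pred:.2f} ($100k units)")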

Now we'll plot the model's predictions in data space. The code for this step is filled in for you because there's no fully generic way to do it; our approach is customized to this particular model and task, so you don't have to understand the details of how it works.

The main thing to observe here is what shapes you see. Think about why you might see those shapes, in light of the prediction equation.

Note: The first plot is a contour plot (aka contour graph) showing both the model's predictions and our training data. The colored dots scattered across the plot are our actual training data points, while the background contour lines show the model's predicted prices across the entire space.

If you've ever seen a topographic map showing elevation, this is the same idea - each line represents locations with the same predicted price, just like topographic lines show locations of the same elevation.

For more information on contour plots, look them up on Wikipedia or Khan Academy.

How to Read This Plot:

  • The x-axis shows longitude
  • The y-axis shows latitude
  • The color and contour lines show the predicted price
  • Darker colors indicate lower predicted prices
  • The scattered dots show where our actual training data points are located

The lines you see are contour lines. Since a linear regression predicts with a single flat plane, its contours are straight, parallel, and evenly spaced; the fit is quite smooth (too smooth?). The second plot shows the predictions in 3D, which may make this clearer.

In [ ]:
plot_model(linreg)
[Figure: linear regression predictions in data space: 2D contour view (top) and 3D surface view (bottom).]

Step A3: Compute the model's predictions on the validation set (call them y_pred). What was the model's prediction for the first house in the validation set? (did you get it right in the previous question?) How does that compare with the actual price for that home?

In [ ]:
y_pred = linreg.predict(X_valid)
In [ ]:
# your code here
Prediction for first house in validation set: 1.55, Actual: 1.61, difference: 0.06

Step A4: Compute and show the mean squared error, the mean absolute error, and the R2 score for the validation set.

  • You may use the mean_absolute_error, mean_squared_error, and r2_score functions (imported from sklearn.metrics above).
  • Use the predictions you already made above.
  • Use Shift-TAB or ? to get the documentation for these functions to ensure you're passing the arguments in the correct order.
In [ ]:
# your code here
Mean absolute error: 0.582
Mean squared error:  0.679
R2 score:            0.154
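One possible solution sketch, assuming the y_pred you computed in Step A3 (note that the true values come first in each call):

In [ ]:
# Compute the three validation-set metrics from the predictions made above.
print(f"Mean absolute error: {mean_absolute_error(y_valid, y_pred):.3f}")
print(f"Mean squared error:  {mean_squared_error(y_valid, y_pred):.3f}")
print(f"R2 score:            {r2_score(y_valid, y_pred):.3f}")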

For convenience, sklearn models have a .score() method, which computes the predictions and a single score in one call. Look at the results below: which of the metrics above does .score() seem to use for regression tasks?

In [ ]:
print(f"Training score: {linreg.score(X_train, y_train):.3f}")
print(f"Validation score: {linreg.score(X_valid, y_valid):.3f}")
Training score: 0.147
Validation score: 0.154
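One way to check your answer (a sketch): compare .score() against the metric functions you used above.

In [ ]:
# If this matches linreg.score(X_valid, y_valid), that's the metric .score() uses.
print(r2_score(y_valid, linreg.predict(X_valid)))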

Reflect: (these questions might make more sense once you have looked at the other two types of models below)

  1. How does a linear regression make a prediction?
  2. Is a linear regression able to capture that homes are typically more expensive on one side of town than the other? (Which side?) Why or why not?
  3. Is a linear regression able to capture that homes in one neighborhood are typically more expensive than those in adjacent neighborhoods? Why or why not?

Part B: Decision tree regression¶

Step B1: Fit a decision tree model (call it dtree_reg) to the training set (X_train, y_train).

We'll use a small max_depth to be able to plot the tree. We'll then fit another one with full depth.

Notice how the tree makes its prediction starting at the top (root) and checking one feature at a time. If the check is True, it goes left; otherwise, it goes right. When it hits a node with no check (a "leaf"), it predicts the value stored there. (Think: how do you think it might have computed that value?)

In [ ]:
dtree_reg_small = DecisionTreeRegressor(max_depth=2, random_state=42).fit(X_train, y_train)
plt.figure(figsize=(20, 15))
sklearn.tree.plot_tree(dtree_reg_small, feature_names=feature_names, filled=True);
[Figure: the fitted depth-2 decision tree, with one split per internal node.]

Exercise: compute by hand the tree's prediction for the same first house in the validation set.

your answer here
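To verify your hand traversal, you can ask the small tree directly (a sketch; predict expects a 2D array, hence the reshape):

In [ ]:
# Predict for the example home from Part A (reshape one sample into a 1-row 2D array).
dtree_reg_small.predict(example_home.reshape(1, -1))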

Now let's let the tree grow as big as it wants.

In [ ]:
dtree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

If the tree is big, the graphic may get unreadable. A text export may be easier to read:

In [ ]:
print(sklearn.tree.export_text(dtree_reg, feature_names=feature_names, max_depth=2))
|--- Latitude <= 42.05
|   |--- Longitude <= -93.63
|   |   |--- Latitude <= 42.00
|   |   |   |--- truncated branch of depth 9
|   |   |--- Latitude >  42.00
|   |   |   |--- truncated branch of depth 22
|   |--- Longitude >  -93.63
|   |   |--- Latitude <= 42.04
|   |   |   |--- truncated branch of depth 28
|   |   |--- Latitude >  42.04
|   |   |   |--- truncated branch of depth 20
|--- Latitude >  42.05
|   |--- Longitude <= -93.65
|   |   |--- Longitude <= -93.66
|   |   |   |--- truncated branch of depth 12
|   |   |--- Longitude >  -93.66
|   |   |   |--- truncated branch of depth 24
|   |--- Longitude >  -93.65
|   |   |--- Longitude <= -93.63
|   |   |   |--- truncated branch of depth 22
|   |   |--- Longitude >  -93.63
|   |   |   |--- truncated branch of depth 16

Step B2: Plot the decision tree model in data space. Observe what shapes you see.

In [ ]:
plot_model(dtree_reg)
[Figure: decision tree predictions in data space: 2D contour view (top) and 3D surface view (bottom).]
In [ ]:
# your code here
Mean absolute error: 0.3832485153583618
Mean squared error: 0.4089356941066932

Part C: Random Forest regression¶

Random Forests take random subsets of the data and fit decision trees to each one. As each tree is fit, it also considers only a random subset of features for each decision. The combination of these two reduces the variance of the model, that is, how much the model's predictions change if it's fit on different subsets of data.

Fit a random forest regression model to this data. Use the default hyperparameters and a random_state of 42 (only matters if you want to compare your results with the reference notebook).

In [ ]:
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
print(f"We just fit a random forest with {rf_reg.n_estimators} trees.")
We just fit a random forest with 100 trees.
In [ ]:
plot_model(rf_reg)
[Figure: random forest predictions in data space: 2D contour view (top) and 3D surface view (bottom).]

Note: you can use code like this to show all of the different trees in the forest. It may or may not work on your computer, though.

In [ ]:
if False:
    from matplotlib.animation import FuncAnimation
    from IPython.display import HTML
    def frame(i):
        plt.clf()
        plot_model(rf_reg.estimators_[i], fig=fig)
        plt.title(f"Tree {i:03d}")
    fig = plt.figure(figsize=(16, 10))
    anim = FuncAnimation(fig=fig, func=frame, frames=min(10, len(rf_reg.estimators_)))
    # One of these two should work:
    display(HTML(anim.to_html5_video()))
    #display(HTML(anim.to_jshtml()))

Again, compute the predictions and errors.

In [ ]:
# your code here
Mean absolute error: 0.28960648131845396
Mean squared error: 0.20209223072066412
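One possible solution sketch, mirroring Step A4:

In [ ]:
# Validation-set predictions and errors for the random forest.
rf_pred = rf_reg.predict(X_valid)
print("Mean absolute error:", mean_absolute_error(y_valid, rf_pred))
print("Mean squared error: ", mean_squared_error(y_valid, rf_pred))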

Analysis¶

Q1: Describe the basic steps for fitting a model in sklearn and making predictions.

your narrative answer here

Q2: Describe the parameters that the fit method takes. For each one, describe its purpose and its shape.

your narrative answer here

Q3: Describe, qualitatively, what each of the three models here looks like in data space. Describe a characteristic of the visualization that would let you immediately tell which type of model produced it. You might notice differences in the shapes of the boundaries each model draws and, if you look more closely, a difference in how those boundaries relate to the data.

your narrative answer here

Q4: Describe, quantitatively, how the performance of the different models compares. Which performs best? Which performs worst? Explain how the performance numbers make sense in light of the data-space plots.

your narrative answer here

Extension¶

optional

  1. Compute the loss on the training set for each of these models. Can that help you tell whether the model overfit or not?
  2. Try using more features in the dataset. How well can you predict the price? Be careful about categorical features; see the sketch below for one way to handle them. (Note that you won't be able to use plot_model as-is if you add additional features.)
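For item 2, here's one possible starting point (a sketch: the specific columns are just examples drawn from ames.info() above, and the pipeline pieces are standard sklearn tools rather than anything set up earlier in this lab):

In [ ]:
# A sketch for using more features, including categorical ones (example columns only).
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

numeric_features = ['Gr_Liv_Area', 'Year_Built', 'Full_Bath']  # example numeric columns
categorical_features = ['Neighborhood', 'Bldg_Type']           # example categorical columns

X2 = ames[numeric_features + categorical_features]
X2_train, X2_valid, y2_train, y2_valid = train_test_split(X2, y, test_size=.2, random_state=42)

model = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical_features),
        remainder='passthrough'),  # numeric columns pass through unchanged
    RandomForestRegressor(random_state=42))
model.fit(X2_train, y2_train)
print(f"Validation R2 score: {model.score(X2_valid, y2_valid):.3f}")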