import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mpl.rcParams['figure.figsize'] = (10, 8)
data = pd.read_csv(
    "AirQuality_Sensors.csv",
    parse_dates={"Date_Time": ["Date", "Time"]},  # combine the two columns into one datetime index
    index_col="Date_Time",
    na_values="-200",  # this dataset marks missing readings with -200
)
data.info()
Let's check those dates. Always a good idea...
plt.scatter(np.arange(len(data)), data.index)
data = pd.read_csv(
    "AirQuality_Sensors.csv",
    parse_dates={"Date_Time": ["Date", "Time"]},
    index_col="Date_Time",
    dayfirst=True,  # the dates are written day-first, so parse them that way
    na_values="-200",
)
plt.scatter(np.arange(len(data)), data.index)
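With day-first parsing, the index should now increase steadily. A quick programmatic check (a sketch; assumes the file covers one contiguous recording period):
# The scatter above should now trace a straight line; this asserts the same thing
assert data.index.is_monotonic_increasing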
# Select the data of interest to work with
df = data[['C6H6(GT)', 'PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)',
           'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']]
# Rename the sensor columns to the pollutant each one targets
columns = {'C6H6(GT)': 'C6H6', 'PT08.S1(CO)': 'CO', 'PT08.S2(NMHC)': 'NMHC',
           'PT08.S3(NOx)': 'NOX', 'PT08.S4(NO2)': 'NO2', 'PT08.S5(O3)': 'O3'}
df = df.rename(columns=columns)
df.head()
df.info()
time_series_plot_kwargs = dict(
    subplots=True,
    marker='.',
    markersize=.5,
    linestyle="",
    figsize=(10, 8),
)
df.plot(**time_series_plot_kwargs);
What do you observe about each of these time series?
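If the hourly point cloud is hard to read, a daily average can make longer-term structure easier to see (a sketch using pandas resampling):
# Average each day's readings; lines work better than dots at this resolution
df.resample('D').mean().plot(subplots=True, figsize=(10, 8));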
df_simple = df.iloc[:, :-3]  # drop T, RH, AH for now
# One column of scatter plots: each remaining variable against C6H6
sns.pairplot(df_simple, x_vars=['C6H6'])
# See https://github.com/mwaskom/seaborn/issues/2472
sns.displot(
    df_simple.melt(),
    x="value", col="variable",
    common_bins=False,
    col_wrap=3, facet_kws={'sharex': False})
We need to split the data in two ways: first, to separate the features (the cheap sensor readings) from the target (the expensive reference measurement of C6H6), and second, to separate train from test.
target_name = 'C6H6'
X = df.drop([target_name], axis=1)
y = df[target_name]
Now to split train from test. How should we do this? Approach 1:
(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X, y, test_size=0.2, random_state=12345)
X_train.plot(**time_series_plot_kwargs);
# Bump the marker size so the sparser test points stay visible
X_test.plot(**dict(time_series_plot_kwargs, markersize=1));
In light of those observations, let's use a different approach:
(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X, y, test_size=0.2, shuffle=False)
# Note that `random_state` no longer has any effect here.
X_train.plot(**time_series_plot_kwargs);
X_test.plot(**time_series_plot_kwargs);
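A quick sanity check that this split really is chronological (a sketch):
# With shuffle=False, every test timestamp should come after every train timestamp
assert X_train.index.max() < X_test.index.min()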
sns.heatmap(df.isnull())
df.isnull().sum(axis=1).plot()
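The per-row sum shows when data are missing; a per-column count shows which sensors are affected:
df.isnull().sum()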
What does IterativeImputer do?
# IterativeImputer is still experimental: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
df_imputed = IterativeImputer().fit_transform(df)
row_had_missing = df.isna().sum(axis=1) > 0
pd.DataFrame(df_imputed[row_had_missing], columns=df.columns)
What do you notice about the values that were imputed?
Notice how we used the entire original dataset in the imputation. List the two potential issues that could come up when doing this. (Assume, hypothetically, that there were actually different features missing in different cases.)
X_train_imputed = IterativeImputer().fit_transform(X_train)
row_had_missing = X_train.isna().sum(axis=1) > 0
pd.DataFrame(X_train_imputed[row_had_missing], columns=X_train.columns)
Were these values the same or different than when we used the full dataframe (df)? (Note that we have one fewer column also.)
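One way to answer that programmatically (a sketch; assumes both imputations above were run in this session and that the timestamps in the index are unique):
# Rebuild both imputed arrays as DataFrames so rows and columns align by label,
# then compare them on the training rows that actually had missing entries
df_based = pd.DataFrame(df_imputed, index=df.index, columns=df.columns).drop(columns=[target_name])
train_based = pd.DataFrame(X_train_imputed, index=X_train.index, columns=X_train.columns)
missing_rows = X_train.index[X_train.isna().any(axis=1)]
(df_based.loc[missing_rows] - train_based.loc[missing_rows]).abs().max()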
# Drop rows where the target itself is missing; we can't train or score on those
target_not_missing = ~y.isna()
X_full = X[target_not_missing]
y_full = y[target_not_missing]
X_full.isna().sum()
(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X_full, y_full, test_size=0.2, shuffle=False)
assert not any(np.any(arr.isna()) for arr in [X_train, X_test, y_train, y_test])
It turns out that no features are missing after this drop.
scaler1 = MinMaxScaler()
scaled_1 = scaler1.fit_transform(df)
pd.DataFrame(scaled_1, columns=df.columns).describe().round(3)
scaler1.data_max_
scaler2 = StandardScaler()
scaled_2 = scaler2.fit_transform(df)
pd.DataFrame(scaled_2, columns=df.columns).describe().round(3)
scaler2.mean_, scaler2.var_
What does the MinMaxScaler do to the min and max? What does the StandardScaler do to the mean and std? What happens to the (mean, std) after the MinMaxScaler? What happens to the (min, max, std) after the StandardScaler?
preproc_pipeline = make_pipeline(
    StandardScaler(),
    IterativeImputer()
)
# Fit the preprocessing on the training data only...
X_train_scaled = preproc_pipeline.fit_transform(X_train, y_train)
X_train_scaled.shape
# ...then apply the already-fitted scaler and imputer to the test data
X_test_scaled = preproc_pipeline.transform(X_test)
# Train on the preprocessed features and score on the held-out test set
linreg = LinearRegression().fit(X_train_scaled, y_train)
r2_score(y_test, linreg.predict(X_test_scaled))
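mean_absolute_error and mean_squared_error were imported above but not yet used; unlike R², they report the error in the units of C6H6 itself (a quick sketch):
pred = linreg.predict(X_test_scaled)
# MAE and RMSE on the test set
mean_absolute_error(y_test, pred), mean_squared_error(y_test, pred) ** 0.5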
Random forest.
from sklearn.ensemble import RandomForestRegressor
rfreg = RandomForestRegressor().fit(X_train, y_train)
r2_score(y_test, rfreg.predict(X_test))
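A quick look at which sensors the forest leans on most (a sketch; these are impurity-based importances, so read them with care):
pd.Series(rfreg.feature_importances_, index=X_train.columns).sort_values().plot.barh();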
Splines.
from sklearn.preprocessing import SplineTransformer
spline_model = make_pipeline(
    SplineTransformer(n_knots=3),
    LinearRegression()
).fit(X_train, y_train)
r2_score(y_test, spline_model.predict(X_test))
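To compare the three models at a glance (a sketch; reuses the estimators fitted above):
pd.Series({
    'linear regression': r2_score(y_test, linreg.predict(X_test_scaled)),
    'random forest': r2_score(y_test, rfreg.predict(X_test)),
    'splines + linear': r2_score(y_test, spline_model.predict(X_test)),
}, name='test R^2')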