import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


mpl.rcParams['figure.figsize'] = (10, 8)


data = pd.read_csv(
    "AirQuality_Sensors.csv",
    parse_dates={"Date_Time": ["Date", "Time"]},
    index_col="Date_Time",
    na_values="-200"
)
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-10-03 18:00:00 to 2005-04-04 14:00:00
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CO(GT)         7674 non-null   float64
 1   PT08.S1(CO)    8991 non-null   float64
 2   NMHC(GT)       914 non-null    float64
 3   PT08.S2(NMHC)  8991 non-null   float64
 4   NOx(GT)        7718 non-null   float64
 5   PT08.S3(NOx)   8991 non-null   float64
 6   NO2(GT)        7715 non-null   float64
 7   PT08.S4(NO2)   8991 non-null   float64
 8   C6H6(GT)       8991 non-null   float64
 9   PT08.S5(O3)    8991 non-null   float64
 10  T              8991 non-null   float64
 11  RH             8991 non-null   float64
 12  AH             8991 non-null   float64
dtypes: float64(13)
memory usage: 1023.4 KB


plt.scatter(np.arange(len(data)), data.index)

<matplotlib.collections.PathCollection at 0x1245a69a0>


data = pd.read_csv(
    "AirQuality_Sensors.csv",
    parse_dates={"Date_Time": ["Date", "Time"]},
    index_col="Date_Time",
    dayfirst=True,
    na_values="-200"
)
plt.scatter(np.arange(len(data)), data.index)

<matplotlib.collections.PathCollection at 0x1247023a0>


#Select the data of interest to work with
df = data[['C6H6(GT)','PT08.S1(CO)','PT08.S2(NMHC)','PT08.S3(NOx)','PT08.S4(NO2)', 
      'PT08.S5(O3)','T','RH','AH']]

#rename columns
columns={'C6H6(GT)':'C6H6','PT08.S1(CO)':'CO','PT08.S2(NMHC)':'NMHC','PT08.S3(NOx)':'NOX','PT08.S4(NO2)':'NO2','PT08.S5(O3)':'O3'}
df = df.rename(columns=columns)
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-03-10 18:00:00 to 2005-04-04 14:00:00
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   C6H6    8991 non-null   float64
 1   CO      8991 non-null   float64
 2   NMHC    8991 non-null   float64
 3   NOX     8991 non-null   float64
 4   NO2     8991 non-null   float64
 5   O3      8991 non-null   float64
 6   T       8991 non-null   float64
 7   RH      8991 non-null   float64
 8   AH      8991 non-null   float64
dtypes: float64(9)
memory usage: 731.0 KB


time_series_plot_kwargs = dict(
    subplots=True,
    marker='.',
    markersize=.5,
    linestyle="",
    figsize=(10, 8))
df.plot(**time_series_plot_kwargs);


df_simple = df.iloc[:, :-3]
sns.pairplot(df_simple, x_vars=['C6H6'])

<seaborn.axisgrid.PairGrid at 0x122266a00>


# See https://github.com/mwaskom/seaborn/issues/2472
sns.displot(
    df_simple.melt(),
    x="value", col="variable",
    common_bins=False,
    col_wrap=3, facet_kws={'sharex': False})

<seaborn.axisgrid.FacetGrid at 0x1224a9fd0>


target_name = 'C6H6'
X = df.drop([target_name], axis=1)
y = df[target_name]


(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X, y, test_size=0.2, random_state=12345)


X_train.plot(**time_series_plot_kwargs);


X_test.plot(**dict(time_series_plot_kwargs, markersize=1));


(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X, y, test_size=0.2, shuffle=False)
# Note that `random_state` no longer has any effect here.


X_train.plot(**time_series_plot_kwargs);


X_test.plot(**dict(time_series_plot_kwargs));


sns.heatmap(df.isnull())

<AxesSubplot:ylabel='Date_Time'>


df.isnull().sum(axis=1).plot()

<AxesSubplot:xlabel='Date_Time'>


# IterativeImputer is still experimental: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


df_imputed = IterativeImputer().fit_transform(df)


row_had_missing = df.isna().sum(axis=1) > 0
pd.DataFrame(df_imputed[row_had_missing], columns=df.columns)


X_train_imputed = IterativeImputer().fit_transform(X_train)
row_had_missing = X_train.isna().sum(axis=1) > 0
pd.DataFrame(X_train_imputed[row_had_missing], columns=X_train.columns)


target_not_missing = ~y.isna()
X_full = X[target_not_missing]
y_full = y[target_not_missing]
X_full.isna().sum()

CO      0
NMHC    0
NOX     0
NO2     0
O3      0
T       0
RH      0
AH      0
dtype: int64


(
    X_train, X_test,
    y_train, y_test
) = train_test_split(X_full, y_full, test_size=0.2, shuffle=False)
assert not any(np.any(arr.isna()) for arr in [X_train, X_test, y_train, y_test])


scaler1 = MinMaxScaler()
scaled_1 = scaler1.fit_transform(df)
pd.DataFrame(scaled_1, columns=df.columns).describe().round(3)


scaler1.data_max_

array([6.370e+01, 2.040e+03, 2.214e+03, 2.683e+03, 2.775e+03, 2.523e+03,
       4.460e+01, 8.870e+01, 2.231e+00])


scaler2 = StandardScaler()
scaled_2 = scaler2.fit_transform(df)
pd.DataFrame(scaled_2, columns=df.columns).describe().round(3)


scaler2.mean_, scaler2.var_

(array([1.00831053e+01, 1.09983317e+03, 9.39153376e+02, 8.35493605e+02,
        1.45626460e+03, 1.02290613e+03, 1.83178289e+01, 4.92342009e+01,
        1.02553027e+00]),
 array([5.54936407e+01, 4.71185014e+04, 7.11910924e+04, 6.59478002e+04,
        1.19845813e+05, 1.58772067e+05, 7.79975923e+01, 2.99841412e+02,
        1.63046484e-01]))


preproc_pipeline = make_pipeline(
    StandardScaler(),
    IterativeImputer()
)
X_train_scaled = preproc_pipeline.fit_transform(X_train, y_train)
X_train_scaled.shape

(7192, 8)


X_test_scaled = preproc_pipeline.transform(X_test)


linreg = LinearRegression().fit(X_train, y_train)
r2_score(y_test, linreg.predict(X_test))

0.933122964342833


from sklearn.ensemble import RandomForestRegressor


rfreg = RandomForestRegressor().fit(X_train, y_train)
r2_score(y_test, rfreg.predict(X_test))

0.9998662770896344


from sklearn.preprocessing import SplineTransformer
spline_model = make_pipeline(
    SplineTransformer(n_knots=3),
    LinearRegression()
).fit(X_train, y_train)
r2_score(y_test, spline_model.predict(X_test))

0.9999772685857248

	C6H6	CO	NMHC	NOX	NO2	O3	T	RH	AH
Date_Time
2004-03-10 18:00:00	11.9	1360.0	1046.0	1056.0	1692.0	1268.0	13.6	48.9	0.7578
2004-03-10 19:00:00	9.4	1292.0	955.0	1174.0	1559.0	972.0	13.3	47.7	0.7255
2004-03-10 20:00:00	9.0	1402.0	939.0	1140.0	1555.0	1074.0	11.9	54.0	0.7502
2004-03-10 21:00:00	9.2	1376.0	948.0	1092.0	1584.0	1203.0	11.0	60.0	0.7867
2004-03-10 22:00:00	6.5	1272.0	836.0	1205.0	1490.0	1110.0	11.2	59.6	0.7888

	C6H6	CO	NMHC	NOX	NO2	O3	T	RH	AH
0	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
1	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
2	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
3	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
4	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
...	...	...	...	...	...	...	...	...	...
361	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
362	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
363	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
364	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553
365	10.083105	1099.833166	939.153376	835.493605	1456.264598	1022.906128	18.317829	49.234201	1.02553

	CO	NMHC	NOX	NO2	O3	T	RH	AH
0	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
1	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
2	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
3	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
4	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
...	...	...	...	...	...	...	...	...
275	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
276	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
277	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
278	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781
279	1101.245802	961.086745	848.404719	1540.220402	1023.434837	20.382623	48.978348	1.124781

	C6H6	CO	NMHC	NOX	NO2	O3	T	RH	AH
count	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000
mean	0.157	0.325	0.304	0.217	0.407	0.348	0.435	0.504	0.411
std	0.117	0.156	0.146	0.109	0.156	0.173	0.190	0.218	0.197
min	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
25%	0.068	0.208	0.192	0.142	0.304	0.222	0.295	0.335	0.270
50%	0.127	0.299	0.287	0.205	0.410	0.322	0.424	0.508	0.396
75%	0.219	0.419	0.400	0.274	0.505	0.457	0.566	0.670	0.552
max	1.000	1.000	1.000	1.000	1.000	1.000	1.000	1.000	1.000

	C6H6	CO	NMHC	NOX	NO2	O3	T	RH	AH
count	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000	8991.000
mean	-0.000	-0.000	-0.000	0.000	0.000	0.000	-0.000	-0.000	0.000
std	1.000	1.000	1.000	1.000	1.000	1.000	1.000	1.000	1.000
min	-1.340	-2.086	-2.084	-2.000	-2.615	-2.013	-2.289	-2.312	-2.082
25%	-0.763	-0.750	-0.767	-0.691	-0.662	-0.731	-0.738	-0.776	-0.715
50%	-0.253	-0.170	-0.113	-0.115	0.019	-0.150	-0.059	0.021	-0.075
75%	0.526	0.604	0.663	0.522	0.629	0.629	0.689	0.766	0.714
max	7.197	4.331	4.778	7.194	3.809	3.765	2.976	2.279	2.985

Time Series Modeling and EDA¶

Load Data¶

Observe!¶

Observe!¶

Splitting Data¶

Observe!¶

Observe!¶

Dealing with Missing Values¶

What does `IterativeImputer` do?¶

Observe!¶

Data Leakage¶

Observe!¶

Drop missing targets¶

Feature Scaling¶

Observe!¶

Pipelines¶

Baselines¶

Time Series Modeling and EDA¶

Load Data¶

Observe!¶

Observe!¶

Splitting Data¶

Observe!¶

Observe!¶

Dealing with Missing Values¶

What does IterativeImputer do?¶

Observe!¶

Data Leakage¶

Observe!¶

Drop missing targets¶

Feature Scaling¶

Observe!¶

Pipelines¶

Baselines¶

What does `IterativeImputer` do?¶