In [ ]:
# Compute a boolean mask over the rows of data_to_regress: True for every row
# whose index date equals one of the days in `week`.
# The dates of the index do not change inside the loop, so extract them once.
index_dates = data_to_regress.index.date
# Start from an all-False array (not the scalar False) so the mask keeps one
# entry per row even when `week` is empty.
boolean_selection = np.zeros(len(index_dates), dtype=bool)
for d in week:
    boolean_selection = boolean_selection | (index_dates == d)
# Complement of the mask: every row that does not fall on one of those days.
inverse_boolean_selection = np.invert(boolean_selection)
###### Numerizing the entire dataframe ######
# Convert the rows outside the selected days into NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and reading `.values` gives the same array and works
# on both old and new pandas versions.
# NOTE: the trailing space in 'Precipitation_In ' is part of the column name.
feature_columns = [
    'Events',
    'Mean_Temperature_F',
    'Precipitation_In ',
    'month',
    'weekday',
    'hour',
]
training_rows = data_to_regress.loc[inverse_boolean_selection, :]
X = training_rows[feature_columns].values
# Double brackets keep y two-dimensional (one column), as as_matrix() did.
y = training_rows[['daily_variation']].values
##### Pre-processing (Testing/Training set) #####
# Half of the rows (integer division) is counted as the "testing" share; the
# remainder is the training share. Report both sizes.
test_size = X.shape[0] // 2
print('Split: {} testing and {} training samples'.format(test_size, y.size - test_size))

# Shuffle the row positions and keep those from test_size onward for training.
perm = np.random.permutation(y.size)
train_idx = perm[test_size:]
X_train, y_train = X[train_idx], y[train_idx]
# NOTE(review): the rows at perm[:test_size] are not used anywhere in the
# visible code -- confirm that leaving them out of training is intended.
#### Regression using Scikit Learn ####
from sklearn import linear_model, metrics, ensemble

# Random forest regressor with 20 trees, fitted on the training rows.
# The target is flattened to 1-D before fitting.
model = ensemble.RandomForestRegressor(n_estimators=20)
model.fit(X_train, np.ravel(y_train))
In [ ]:
#Numerizing
# Convert the rows that fall on the selected days into NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and reading `.values` gives the same array and works
# on both old and new pandas versions.
# NOTE: the trailing space in 'Precipitation_In ' is part of the column name.
feature_columns = [
    'Events',
    'Mean_Temperature_F',
    'Precipitation_In ',
    'month',
    'weekday',
    'hour',
]
test_rows = data_to_regress.loc[boolean_selection, :]
X_test = test_rows[feature_columns].values
# Double brackets keep y_test two-dimensional (one column), as as_matrix() did.
y_test = test_rows[['daily_variation']].values
#Prediction
y_pred = model.predict(X_test)
#Dataframe to compare
# Take an explicit copy of the selected rows: adding a column to a frame
# obtained by slicing another frame raises pandas' SettingWithCopyWarning and
# leaves it unclear whether data_to_regress itself could be affected.
comparison = data_to_regress.loc[boolean_selection, :].copy()
# Attach the predictions as a new column, aligned on the selected rows' index.
comparison['prediction'] = pd.Series(y_pred, index=comparison.index)
#Performance
# Mean squared error and mean absolute error of the predictions against the
# observed values, each printed on its own line with four decimals.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('mse: {:.4f}'.format(mse), 'mae: {:.4f}'.format(mae), sep='\n')
# Visualization
# Scatter the first n observed values and the first n predictions on one figure.
n = 96  # number of leading samples drawn
# NOTE(review): the title calls this "the first day"; that presumes 96 samples
# per day -- confirm against the data's sampling interval.
dot_style = dict(alpha=.7, markersize=10)  # marker settings shared by both series

plt.figure(figsize=(15, 5))
plt.title('Comparison Prediction and Ground truth over the first day of the week',
          fontsize=15)
plt.plot(y_test[:n], '.', label='ground truth', **dot_style)
plt.plot(y_pred[:n], '.', label='prediction', **dot_style)
plt.legend()
plt.show()
# Second view: plot the observed and predicted columns of the comparison
# frame against its index, using pandas' own plotting.
plt.figure(figsize=(30, 10))
plt.title('Comparison Prediction and Ground truth over a week', fontsize=25)
comparison['daily_variation'].plot(grid=True, fontsize=15, label='ground truth')
comparison['prediction'].plot(grid=True, fontsize=15, label='prediction')
plt.legend(fontsize=20)
plt.show()