In [ ]:
# Shared train/test split, prepared once before the loop over stations.
# The last column of Xtot is compared against station ids; the number of
# rows belonging to station 0 is used as the common sample count.
# NOTE(review): presumably every station has as many rows as station 0 --
# confirm against the data preparation.
time_period = int(np.count_nonzero(Xtot[:, -1] == 0))
# Half of the samples (rounded down) are held out for testing.
test_size = time_period // 2
# One random ordering of row positions, reused for every station.
perm = np.random.permutation(time_period)
# Per-station error accumulators, filled by the training loop.
mse_tot, mae_tot = [], []
In [ ]:
# Fit and score one random-forest regressor per station.
# Every station reuses the same shuffled row positions (`perm`): the first
# `test_size` positions form the test set, the remainder the training set.
# NOTE(review): `perm` is sized from station 0, so this presumably relies on
# every station having that many rows -- a station with fewer rows would
# raise IndexError when indexed below; confirm against the data preparation.
test_rows = perm[:test_size]
train_rows = perm[test_size:]

# Scores are collected in plain lists and appended to the running totals
# once after the loop: np.append copies the whole array on every call, so
# calling it per iteration is needlessly quadratic.
station_mse = []
station_mae = []

for k in int_to_station:
    # Select the rows whose last column equals this station's id.
    boolean_station = Xtot[:, -1] == k
    X = Xtot[boolean_station]
    y = ytot[boolean_station]

    # Pre-processing (testing / training set).
    X_test, X_train = X[test_rows], X[train_rows]
    y_test, y_train = y[test_rows], y[train_rows]

    # Regression using scikit-learn; ravel() passes fit() a 1-D target.
    model = ensemble.RandomForestRegressor(n_estimators=20)
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)

    # Held-out errors for this station.
    mse = metrics.mean_squared_error(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    station_mse.append(mse)
    station_mae.append(mae)

# Same end state as appending inside the loop: arrays holding one score per
# station, in dict iteration order, after any values already present.
mse_tot = np.append(mse_tot, station_mse)
mae_tot = np.append(mae_tot, station_mae)
In [ ]:
# Visualization: bar chart of the per-station errors, ordered by MSE.
# Rows are labelled with the station names, in the same dict order the
# scores were produced in.
temp = pd.DataFrame(
    {'mse': mse_tot, 'mae': mae_tot},
    index=list(int_to_station.values()),
)
ax = temp.sort_values(by='mse').plot(
    kind='bar',
    figsize=(30, 10),
    fontsize=15,
)
ax.set_title(
    'Regression Evaluation One model per Station For 20 Trees',
    fontsize=25,
)
ax.set_ylabel('Measure Value', fontsize=15)
ax.legend(fontsize=15)
plt.show()