In [ ]:
# Out of the for loop on stations
# Shared train/test split used by the per-station regressions.
# The number of rows is measured on the rows whose last column equals 0 and the
# same permutation is then reused for every station.
# NOTE(review): this assumes each station has the same number of rows as
# station 0 -- confirm against how Xtot is built.
SPLIT_SEED = 0  # fixed seed so that Restart & Run All reproduces the same split

# Count matching rows without materialising a filtered copy of Xtot.
time_period = int(np.count_nonzero(Xtot[:, -1] == 0))
test_size = time_period // 2  # first half of the permutation is the test set

# A dedicated RandomState keeps the split reproducible without touching the
# global numpy RNG state that other cells may rely on.
perm = np.random.RandomState(SPLIT_SEED).permutation(time_period)

# Per-station score accumulators (one entry per station, filled by the loop cell).
mse_tot = []
mae_tot = []

In [ ]:
# Train and evaluate one random-forest regressor per station.
#
# Scores are gathered in fresh local lists and published to mse_tot / mae_tot
# once the loop is done, so re-running this cell replaces the results instead
# of appending a second set of scores (which would no longer line up with the
# station index used when plotting).
station_mse = []
station_mae = []

# The train/test row positions are the same for every station, so slice the
# shared permutation once instead of four times per iteration.
# NOTE(review): perm is sized from station 0's row count; a station with fewer
# rows raises IndexError and one with more rows silently ignores the extra
# rows -- confirm all stations have the same length.
test_idx = perm[:test_size]
train_idx = perm[test_size:]

for k in int_to_station.keys():
    # Selecting the station to regress on (station key is stored in the last column of Xtot)
    boolean_station = (Xtot[:, -1] == k)
    X = Xtot[boolean_station]
    y = ytot[boolean_station]

    # Pre-processing (Testing/Training set)
    X_test = X[test_idx]
    X_train = X[train_idx]
    y_test = y[test_idx]
    y_train = y[train_idx]

    # Regression using Scikit Learn: a 20-tree random forest
    model = ensemble.RandomForestRegressor(n_estimators=20)

    # ravel() flattens the target to the 1-D shape expected by fit()
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)

    mse = metrics.mean_squared_error(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)

    station_mse.append(mse)
    station_mae.append(mae)

# Publish the results as arrays, in the iteration order of int_to_station.
mse_tot = np.array(station_mse)
mae_tot = np.array(station_mae)

In [ ]:
#Visualization
# Error table indexed by the values of int_to_station, with one column per
# metric ('mse' first, then 'mae').
temp = pd.DataFrame(
    mse_tot,
    index=int_to_station.values(),
    columns=['mse'],
).assign(mae=mae_tot)

# Grouped bar chart of both metrics, rows ordered by increasing MSE.
label_size = 15
temp_by_mse = temp.sort_values(by='mse')
ax = temp_by_mse.plot.bar(figsize=(30, 10), fontsize=label_size)
ax.set_title('Regression Evaluation One model per Station For 20 Trees', fontsize=25)
ax.set_ylabel('Measure Value', fontsize=label_size)
ax.legend(fontsize=label_size)
plt.show()