In [ ]:
#Pre-processing: random 50/50 split into testing/training sets.
# NOTE(review): no random seed is set, so the split differs on every run —
# consider np.random.seed(...) in a config cell for reproducibility.
n_samples = ytot.size
# Guard against Xtot/ytot drifting out of sync (the original derived test_size
# from Xtot.shape[0] but permuted over ytot.size — two independent size sources).
assert Xtot.shape[0] == n_samples, "Xtot and ytot must have the same number of samples"
test_size = n_samples // 2
print('Split: {} testing and {} training samples'.format(test_size, n_samples - test_size))
perm = np.random.permutation(n_samples)
test_idx, train_idx = perm[:test_size], perm[test_size:]
X_test, X_train = Xtot[test_idx], Xtot[train_idx]
y_test, y_train = ytot[test_idx], ytot[train_idx]

In [ ]:
#Regression using scikit-learn: a random-forest regressor, sweeping the
# number of trees to find the point of diminishing returns.
from sklearn import linear_model, metrics, ensemble

# Alternative models previously tried (the linear ones underperform here):
# model = linear_model.LinearRegression()
# model = linear_model.Ridge(alpha=1e3)
# model = linear_model.SGDRegressor()       # performed very poorly
# model = linear_model.BayesianRidge()
# model = linear_model.LogisticRegression() # NOTE: a classifier, not a regressor

n_estimators = 50
# Tree counts to evaluate: every value 1..19, then 20, 30, 40, 50.
to_test = np.append(range(1, 20), range(20, n_estimators + 10, 10))
model = ensemble.RandomForestRegressor()  # clearly the strongest of the models tried

# Accumulate in plain Python lists (np.append copies the whole array on every
# call, which is quadratic in the number of iterations), then convert once.
mse_list = []
mae_list = []
for n in to_test:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    mse_list.append(metrics.mean_squared_error(y_test, y_pred))
    mae_list.append(metrics.mean_absolute_error(y_test, y_pred))
# Keep the public names `mse`/`mae` as numpy arrays, as downstream cells expect.
mse = np.array(mse_list)
mae = np.array(mae_list)

In [ ]:
#Performance visualization

# MSE and MAE as a function of the number of trees (one panel each).
plt.figure(figsize=(15, 5))
ax = plt.subplot(1, 2, 1)
ax.set_title('Optimal Number of Trees')
ax.set_xlabel('Number of estimators')
ax.set_ylabel('Mean Squared Error')
ax.plot(to_test, mse)
ax = plt.subplot(1, 2, 2)
ax.set_title('On average, of how many units is the prediction from the truth?')
ax.set_xlabel('Number of estimators')
ax.set_ylabel('Mean Absolute Error')
ax.plot(to_test, mae)
# plt.savefig('One_model_all_stations.png', bbox_inches='tight')
plt.show()

# Error distribution for the last fitted model (n_estimators == to_test[-1] == 50).
# Per-sample absolute error computed directly: the original reshaped y_pred with
# np.expand_dims to feed metrics.pairwise.paired_distances(metric='l1'), which is
# the same quantity but also mutated y_pred's shape for any later cell.
absolute_error = np.abs(np.ravel(y_test) - np.ravel(y_pred))

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Absolute Error Distribution (for 50 trees)')
ax.set_xlabel('Absolute error', fontsize=15)
ax.set_ylabel('Prediction counts', fontsize=15)
counts, bins, patches = ax.hist(absolute_error, bins=range(int(absolute_error.max())))
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for count, x in zip(counts, bin_centers):
    # Label each bar with its raw count; hist returns counts as floats,
    # so cast to int to display "12" rather than "12.0".
    ax.annotate(str(int(count)), xy=(x, 0), xycoords=('data', 'axes fraction'),
                xytext=(0, 40), textcoords='offset points', va='top', ha='center')
# plt.savefig('One_model_all_stations2.png', bbox_inches='tight')
plt.show()