In [23]:
import pandas as pd
import numpy as np
import json
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
pd.set_option("display.max_rows",6)
%matplotlib inline
In [24]:
df_data = pd.read_csv(r'varsom_ml_preproc_3y.csv', index_col=0)
target_ = 'danger_level'
X = df_data.drop([target_, 'date'], axis=1)
y = df_data.filter([target_], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 222, test_size = 0.3)
In [25]:
print(df_data.columns.values)
df_data.describe()
Out[25]:
In [26]:
print(df_data[df_data['danger_level']>3]['region_id'].unique())
Random forests use the bagging (bootstrap aggregating) algorithm: bagging is an ensemble learning method where we build each model using the same algorithm, but train each learner on a different bootstrap sample of the data.
Boosted trees use a boosting algorithm (e.g. AdaBoost): boosting is a variation of ensemble trees that strives to improve the learners by focusing on areas where the system is not performing well.
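To make the distinction concrete, here is a minimal hand-rolled sketch of bagging (an illustration, not part of the original notebook): the same tree algorithm is trained on several bootstrap samples of the training data, and the ensemble prediction is the average over the learners. Boosting itself is used off the shelf further down via GradientBoostingRegressor.
# illustrative bagging sketch, reusing the train/test split defined above
rng = np.random.RandomState(222)
n_estimators = 10
bagged_preds = []
for _ in range(n_estimators):
    # draw a bootstrap sample: rows sampled with replacement
    idx = rng.randint(0, len(X_train), len(X_train))
    model = tree.DecisionTreeRegressor(max_depth=3)  # shallow trees, just for illustration
    model.fit(X_train.iloc[idx], y_train.iloc[idx])
    bagged_preds.append(model.predict(X_test))
bagged_pred = np.mean(bagged_preds, axis=0)  # average over the bootstrap learners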
In [27]:
dec_tree = tree.DecisionTreeRegressor(random_state=222, max_depth = 3)
dec_tree.fit(X_train, y_train) # we're using the same data as in the last linear model
predictions_dt = dec_tree.predict(X_test)
print(predictions_dt.shape, y_test.shape)
In [34]:
# Visualize the tree
dot_data = tree.export_graphviz(dec_tree, out_file=None,
                                feature_names=df_data.drop([target_, 'date'], axis=1).columns,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('aval_danger_by_problem_dt_lev7')
graph
Out[34]:
The first avalanche problem dictates the danger level - that was expected :-)
We need to evaluate our model:
In [35]:
print('Decision tree R^2: %.4f' % dec_tree.score(X_test, y_test))
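For reference, `score` on a scikit-learn regressor returns the coefficient of determination, $R^{2} = 1 - \sum_i (y_i - \hat{y}_i)^2 / \sum_i (y_i - \bar{y})^2$. A minimal sketch of computing it by hand from the predictions above (assuming predictions_dt and y_test from the previous cells):
# hand-computed R^2 as a cross-check against dec_tree.score (illustrative only)
y_true = np.ravel(y_test)
ss_res = np.sum((y_true - predictions_dt) ** 2)  # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print('Manual R^2: %.4f' % (1 - ss_res / ss_tot))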
In [36]:
depths = range(1, 20)
tree_models = [tree.DecisionTreeRegressor(random_state=222, max_depth=d).fit(X_train, y_train) for d in depths]
tree_Rsquare = [f.score(X_test, y_test) for f in tree_models]
plt.plot(depths, tree_Rsquare, color = 'red')
plt.xlabel('Tree depth')
plt.ylabel('$R^2$')
Out[36]:
Looks like there is little gain when using a depth > 7.
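As a sanity check (not part of the original notebook), the depth choice could also be confirmed with cross-validation instead of a single hold-out split; a minimal sketch using GridSearchCV:
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over max_depth, assuming X_train/y_train from above
param_grid = {'max_depth': list(range(1, 20))}
search = GridSearchCV(tree.DecisionTreeRegressor(random_state=222), param_grid, cv=5, scoring='r2')
search.fit(X_train, y_train)
print('Best depth by CV: %d (CV R^2: %.4f)' % (search.best_params_['max_depth'], search.best_score_))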
In [37]:
# so let's create a tree with depth = 7
dec_tree = tree.DecisionTreeRegressor(random_state=222, max_depth = 7)
dec_tree.fit(X_train, y_train) # we're using the same train/test split as before
predictions_dt = dec_tree.predict(X_test)
In [38]:
# Visualize the tree
dot_data = tree.export_graphviz(dec_tree, out_file=None,
                                feature_names=df_data.drop([target_, 'date'], axis=1).columns,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('aval_danger_by_problem_dt2')
graph
Out[38]:
In [39]:
print('Decision tree R^2: %.4f' % dec_tree.score(X_test, y_test))
Now we see a slight improvement in $R^{2}$.
Maybe we'll get better results with random forests and/or gradient boosted trees.
In [40]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state = 422, max_depth = 7)
forest_reg.fit(X_train, y_train)
predictions_rf = forest_reg.predict(X_test)
In [41]:
print('Random forest R^2: %.4f' % forest_reg.score(X_test, y_test))
In [42]:
from sklearn.ensemble import GradientBoostingRegressor
grad_boost = GradientBoostingRegressor(random_state = 222, max_depth = 6)
grad_boost.fit(X_train, y_train)
predictions_gb = grad_boost.predict(X_test)
print('Gradient boosted tree R^2: %.4f' % grad_boost.score(X_test, y_test))
Again, very small difference between random forests and boosted trees.
The last thing we can check is the importance of the variables - if some of the features are not as useful as the others in explaining the variability in our target variable, we can exclude them in order to simplify our model.
In [43]:
feature_labels = np.array(df_data.drop([target_, 'date'], axis=1).columns)
importance = forest_reg.feature_importances_
feature_indexes_by_importance = importance.argsort()
for index in feature_indexes_by_importance:
    print('{}-{:.2f}%'.format(feature_labels[index], importance[index] * 100.0))
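Instead of hard-coding the feature list in the next cell, the same cut could be made programmatically; a small sketch applying the 1% threshold to the importances printed above (this should roughly reproduce the manual selection below):
# keep only features whose random-forest importance is at least 1%
selected = feature_labels[importance >= 0.01]
print(selected)
# X = df_data[list(selected)]  # would replace the hand-written list in the next cell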
In [44]:
# excluding the variables whose importance is less than 1%
X = df_data[['avalanche_problem_1_distribution_id',
             'avalanche_problem_1_destructive_size_ext_id',
             'avalanche_problem_1_trigger_simple_id',
             'avalanche_problem_1_probability_id',
             'avalanche_problem_2_destructive_size_ext_id',
             'avalanche_problem_2_probability_id']]
y = df_data['danger_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 222)
In [45]:
grad_boost = GradientBoostingRegressor(random_state = 222, max_depth = 7)
grad_boost.fit(X_train, y_train)
predictions_gb = grad_boost.predict(X_test)
In [46]:
print('Gradient boosted tree R^2: %.4f' % grad_boost.score(X_test, y_test))