DECISION TREE

  • is another method we can use for finding the relationship between a target and one or more predictors
  • decision trees can be used for both categorical and continuous targets (so both classification and regression tasks - today we'll be focusing on regression trees)
  • the main idea is to create a tree of decisions that best partitions the data
  • creating a tree involves deciding which features to split the data on and what conditions to use for splitting, as well as knowing when to stop

In [23]:
import pandas as pd
import numpy as np
import json
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split

# Show at most 6 rows when a DataFrame is displayed; longer frames are truncated with "...".
pd.set_option("display.max_rows",6)

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [24]:
# Read the pre-processed table; the first CSV column becomes the index.
df_data = pd.read_csv(r'varsom_ml_preproc_3y.csv', index_col=0)

target_ = 'danger_level'

# Predictors: every column except the target and the 'date' column.
X = df_data.drop([target_, 'date'], axis=1)
# Target, kept as a one-column DataFrame.
y = df_data.filter(items=[target_], axis=1)

# 70/30 train/test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=222,
)

In [25]:
# Print every column name, then display summary statistics for the frame.
column_names = df_data.columns.values
print(column_names)
df_data.describe()


['avalanche_problem_1_cause_id'
 'avalanche_problem_1_destructive_size_ext_id'
 'avalanche_problem_1_distribution_id'
 'avalanche_problem_1_exposed_height_1'
 'avalanche_problem_1_exposed_height_2' 'avalanche_problem_1_ext_id'
 'avalanche_problem_1_probability_id' 'avalanche_problem_1_problem_id'
 'avalanche_problem_1_problem_type_id'
 'avalanche_problem_1_trigger_simple_id' 'avalanche_problem_1_type_id'
 'avalanche_problem_2_cause_id'
 'avalanche_problem_2_destructive_size_ext_id'
 'avalanche_problem_2_distribution_id'
 'avalanche_problem_2_exposed_height_1'
 'avalanche_problem_2_exposed_height_2' 'avalanche_problem_2_ext_id'
 'avalanche_problem_2_probability_id' 'avalanche_problem_2_problem_id'
 'avalanche_problem_2_problem_type_id'
 'avalanche_problem_2_trigger_simple_id' 'avalanche_problem_2_type_id'
 'avalanche_problem_3_cause_id'
 'avalanche_problem_3_destructive_size_ext_id'
 'avalanche_problem_3_distribution_id'
 'avalanche_problem_3_exposed_height_1'
 'avalanche_problem_3_exposed_height_2' 'avalanche_problem_3_ext_id'
 'avalanche_problem_3_probability_id' 'avalanche_problem_3_problem_id'
 'avalanche_problem_3_problem_type_id'
 'avalanche_problem_3_trigger_simple_id' 'avalanche_problem_3_type_id'
 'danger_level' 'mountain_weather_freezing_level'
 'mountain_weather_precip_most_exposed' 'mountain_weather_precip_region'
 'mountain_weather_temperature_elevation'
 'mountain_weather_temperature_max' 'mountain_weather_temperature_min'
 'region_id' 'region_type_id' 'date' 'danger_level_prev1day'
 'danger_level_prev2day' 'danger_level_prev3day'
 'avalanche_problem_1_cause_id_prev1day'
 'avalanche_problem_1_problem_type_id_prev1day'
 'avalanche_problem_1_cause_id_prev2day'
 'avalanche_problem_1_problem_type_id_prev2day'
 'avalanche_problem_1_cause_id_prev3day'
 'avalanche_problem_1_problem_type_id_prev3day'
 'avalanche_problem_2_cause_id_prev1day'
 'avalanche_problem_2_problem_type_id_prev1day'
 'avalanche_problem_2_cause_id_prev2day'
 'avalanche_problem_2_problem_type_id_prev2day'
 'avalanche_problem_2_cause_id_prev3day'
 'avalanche_problem_2_problem_type_id_prev3day'
 'mountain_weather_precip_region_prev1day'
 'mountain_weather_precip_most_exposed_prev1day'
 'mountain_weather_precip_region_prev3daysum'
 'mountain_weather_wind_speed_num' 'mountain_weather_wind_direction_num'
 'avalanche_problem_1_problem_type_id_class'
 'avalanche_problem_1_sensitivity_id_class'
 'avalanche_problem_1_trigger_simple_id_class'
 'avalanche_problem_2_problem_type_id_class'
 'avalanche_problem_2_sensitivity_id_class'
 'avalanche_problem_2_trigger_simple_id_class'
 'avalanche_problem_3_problem_type_id_class'
 'avalanche_problem_3_sensitivity_id_class'
 'avalanche_problem_3_trigger_simple_id_class' 'region_group_id'
 'aval_problem_1_combined' 'emergency_warning_Ikke gitt'
 'emergency_warning_Naturlig utløste skred' 'author_Andreas@nve'
 'author_Eldbjorg@MET' 'author_Espen Granan' 'author_EspenN'
 'author_Halvor@NVE' 'author_HåvardT@met' 'author_Ida@met'
 'author_Ingrid@NVE' 'author_John Smits' 'author_JonasD@ObsKorps'
 'author_Julie@SVV' 'author_Jørgen@obskorps' 'author_Karsten@NVE'
 'author_MSA@nortind' 'author_Matilda@MET' 'author_Odd-Arne@NVE'
 'author_Ragnar@NVE' 'author_Ronny@NVE' 'author_Silje@svv'
 'author_Tommy@NVE' 'author_ToreV@met' 'author_anitaaw@met'
 'author_emma@nve' 'author_haso@nve.no' 'author_heidi@nve.no'
 'author_jan arild@obskorps' 'author_jegu@NVE' 'author_jostein@nve'
 'author_knutinge@svv' 'author_magnush@met' 'author_martin@svv'
 'author_ragnhildn@met' 'author_rue@nve' 'author_siri@met'
 'author_solveig@NVE' 'author_torehum@svv' 'author_torolav@obskorps'
 'mountain_weather_wind_direction_E' 'mountain_weather_wind_direction_N'
 'mountain_weather_wind_direction_NE' 'mountain_weather_wind_direction_NW'
 'mountain_weather_wind_direction_None'
 'mountain_weather_wind_direction_Not given'
 'mountain_weather_wind_direction_S' 'mountain_weather_wind_direction_SE'
 'mountain_weather_wind_direction_SW' 'mountain_weather_wind_direction_W']
Out[25]:
avalanche_problem_1_cause_id avalanche_problem_1_destructive_size_ext_id avalanche_problem_1_distribution_id avalanche_problem_1_exposed_height_1 avalanche_problem_1_exposed_height_2 avalanche_problem_1_ext_id avalanche_problem_1_probability_id avalanche_problem_1_problem_id avalanche_problem_1_problem_type_id avalanche_problem_1_trigger_simple_id ... mountain_weather_wind_direction_E mountain_weather_wind_direction_N mountain_weather_wind_direction_NE mountain_weather_wind_direction_NW mountain_weather_wind_direction_None mountain_weather_wind_direction_Not given mountain_weather_wind_direction_S mountain_weather_wind_direction_SE mountain_weather_wind_direction_SW mountain_weather_wind_direction_W
count 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 ... 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000 33264.000000
mean 7.504990 1.160955 1.012145 335.918711 34.704185 10.563672 1.642496 0.536195 7.453523 10.168951 ... 0.027718 0.012205 0.013949 0.027898 0.004630 0.690536 0.034392 0.097042 0.051166 0.040464
std 7.876866 1.170598 1.026995 401.052501 150.560137 9.951452 1.561384 0.498696 10.807930 10.027306 ... 0.164165 0.109803 0.117281 0.164683 0.067885 0.462280 0.182235 0.296019 0.220340 0.197048
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
50% 10.000000 1.000000 1.000000 200.000000 0.000000 15.000000 3.000000 1.000000 5.000000 10.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
75% 13.000000 2.000000 2.000000 600.000000 0.000000 20.000000 3.000000 1.000000 10.000000 21.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
max 24.000000 4.000000 4.000000 2100.000000 2000.000000 25.000000 5.000000 1.000000 50.000000 22.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 122 columns


In [ ]:


In [26]:
# Region ids that have at least one row with a danger level above 3.
high_danger_rows = df_data[df_data['danger_level'] > 3]
print(high_danger_rows['region_id'].unique())


[3003 3007 3009 3010 3011 3012 3013 3014 3015 3016 3017 3022 3023 3024
 3027 3029 3031 3034]

RANDOM FORESTS

  • use the bagging (bootstrap aggregating) algorithm

  • bagging is an ensemble learning technique where we build each model using the same algorithm, but train each learner on a different set of data

GRADIENT BOOSTED TREES

  • use boosting - here specifically gradient boosting (AdaBoost is another, earlier boosting algorithm)

  • boosting is a variation of ensemble trees that strives to improve the learners by focusing on areas where the system is not performing well.

We can start by creating one decision tree of depth 3 using all features


In [27]:
# Shallow regression tree (maximum depth 3) fitted on the training split;
# per the original note, this is the same data the previous linear model used.
dec_tree = tree.DecisionTreeRegressor(max_depth=3, random_state=222)
dec_tree.fit(X_train, y_train)

# Predict the held-out rows and compare the prediction and target shapes.
predictions_dt = dec_tree.predict(X_test)
print(predictions_dt.shape, y_test.shape)


(9980,) (9980, 1)

In [34]:
# Visualize the tree
dot_data = tree.export_graphviz(dec_tree, out_file=None, 
                         feature_names=df_data.drop([target_, 'date'], axis=1).columns,   
                         filled=True, rounded=True,  
                         special_characters=True)  

graph = graphviz.Source(dot_data) 
graph.render('aval_danger_by_problem_dt_lev7')
graph


Out[34]:
Tree 0 avalanche_problem_1_destructive_size_ext_id ≤ 1.5 mse = 1.4041 samples = 23284 value = 1.1395 1 avalanche_problem_1_ext_id ≤ 5.0 mse = 0.1303 samples = 12228 value = 0.1243 0->1 True 48 avalanche_problem_1_destructive_size_ext_id ≤ 2.5 mse = 0.4124 samples = 11056 value = 2.2623 0->48 False 2 mountain_weather_wind_direction_num ≤ 4.5 mse = 0.0026 samples = 10861 value = 0.0022 1->2 15 avalanche_problem_2_problem_type_id_class ≤ 4.5 mse = 0.0855 samples = 1367 value = 1.0944 1->15 3 author_Tommy@NVE ≤ 0.5 mse = 0.0009 samples = 10847 value = 0.0009 2->3 10 avalanche_problem_2_trigger_simple_id_class ≤ 1.0 mse = 0.2857 samples = 14 value = 1.0 2->10 4 mountain_weather_temperature_min ≤ -11.0 mse = 0.0004 samples = 10841 value = 0.0004 3->4 9 mse = 0.0 samples = 6 value = 1.0 3->9 5 mse = 0.0 samples = 2 value = 1.0 4->5 6 author_heidi@nve.no ≤ 0.5 mse = 0.0002 samples = 10839 value = 0.0002 4->6 7 mse = 0.0 samples = 10837 value = 0.0 6->7 8 mse = 0.0 samples = 2 value = 1.0 6->8 11 mountain_weather_wind_speed_num ≤ 4.5 mse = 0.1389 samples = 12 value = 0.8333 10->11 14 mse = 0.0 samples = 2 value = 2.0 10->14 12 mse = 0.0 samples = 2 value = 0.0 11->12 13 mse = 0.0 samples = 10 value = 1.0 11->13 16 avalanche_problem_1_destructive_size_ext_id ≤ 0.5 mse = 0.0368 samples = 1229 value = 1.0382 15->16 27 danger_level_prev1day ≤ 1.5 mse = 0.2411 samples = 138 value = 1.5942 15->27 17 mse = 0.0 samples = 11 value = 2.0 16->17 18 avalanche_problem_2_destructive_size_ext_id ≤ 2.5 mse = 0.0287 samples = 1218 value = 1.0296 16->18 19 aval_problem_1_combined ≤ 7181.0 mse = 0.0211 samples = 1208 value = 1.0215 18->19 26 mse = 0.0 samples = 10 value = 2.0 18->26 20 region_group_id ≤ 0.5 mse = 0.018 samples = 1202 value = 1.0183 19->20 23 avalanche_problem_2_problem_type_id_prev1day ≤ 15.0 mse = 0.2222 samples = 6 value = 1.6667 19->23 21 mse = 0.0 samples = 2 value = 2.0 20->21 22 mse = 0.0164 samples = 1200 value = 1.0167 20->22 24 mse = 0.0 samples = 4 value = 2.0 
23->24 25 mse = 0.0 samples = 2 value = 1.0 23->25 28 avalanche_problem_1_problem_type_id_prev3day ≤ 2.5 mse = 0.0794 samples = 46 value = 1.087 27->28 35 avalanche_problem_1_problem_type_id_prev3day ≤ 8.5 mse = 0.129 samples = 92 value = 1.8478 27->35 29 mse = 0.0 samples = 2 value = 2.0 28->29 30 mountain_weather_temperature_max ≤ -14.5 mse = 0.0434 samples = 44 value = 1.0455 28->30 31 mse = 0.0 samples = 1 value = 2.0 30->31 32 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.0227 samples = 43 value = 1.0233 30->32 33 mse = 0.0 samples = 42 value = 1.0 32->33 34 mse = 0.0 samples = 1 value = 2.0 32->34 36 avalanche_problem_2_cause_id ≤ 12.0 mse = 0.2495 samples = 23 value = 1.5217 35->36 41 mountain_weather_temperature_elevation ≤ 1600.0 mse = 0.0416 samples = 69 value = 1.9565 35->41 37 mse = 0.0 samples = 9 value = 1.0 36->37 38 author_jan arild@obskorps ≤ 0.5 mse = 0.1224 samples = 14 value = 1.8571 36->38 39 mse = 0.0 samples = 12 value = 2.0 38->39 40 mse = 0.0 samples = 2 value = 1.0 38->40 42 aval_problem_1_combined ≤ 6230.5 mse = 0.0156 samples = 63 value = 1.9841 41->42 45 mountain_weather_precip_region_prev1day ≤ 3.5 mse = 0.2222 samples = 6 value = 1.6667 41->45 43 mse = 0.0 samples = 58 value = 2.0 42->43 44 mse = 0.16 samples = 5 value = 1.8 42->44 46 mse = 0.0 samples = 2 value = 1.0 45->46 47 mse = 0.0 samples = 4 value = 2.0 45->47 49 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.2644 samples = 7768 value = 2.0064 48->49 100 avalanche_problem_1_trigger_simple_id_class ≤ 1.5 mse = 0.242 samples = 3288 value = 2.8668 48->100 50 avalanche_problem_1_distribution_id ≤ 1.5 mse = 0.1717 samples = 6945 value = 1.8979 49->50 81 aval_problem_1_combined ≤ 2357.0 mse = 0.1082 samples = 823 value = 2.9222 49->81 51 danger_level_prev1day ≤ 1.5 mse = 0.2346 samples = 1460 value = 1.3705 50->51 66 avalanche_problem_1_probability_id ≤ 4.0 mse = 0.0613 samples = 5485 value = 2.0383 50->66 52 mountain_weather_precip_most_exposed ≤ 7.5 mse = 0.1071 samples = 
784 value = 1.1186 51->52 59 avalanche_problem_1_sensitivity_id_class ≤ 2.5 mse = 0.2235 samples = 676 value = 1.6627 51->59 53 author_Ronny@NVE ≤ 0.5 mse = 0.0604 samples = 717 value = 1.0614 52->53 56 avalanche_problem_1_cause_id_prev2day ≤ 16.5 mse = 0.1965 samples = 67 value = 1.7313 52->56 54 mse = 0.0517 samples = 713 value = 1.0547 53->54 55 mse = 0.1875 samples = 4 value = 2.25 53->55 57 mse = 0.0916 samples = 49 value = 1.898 56->57 58 mse = 0.2006 samples = 18 value = 1.2778 56->58 60 avalanche_problem_2_problem_type_id_class ≤ 6.5 mse = 0.1684 samples = 154 value = 1.2143 59->60 63 mountain_weather_temperature_min ≤ -2.5 mse = 0.163 samples = 522 value = 1.795 59->63 61 mse = 0.0416 samples = 115 value = 1.0435 60->61 62 mse = 0.2025 samples = 39 value = 1.7179 60->62 64 mse = 0.009 samples = 332 value = 1.991 63->64 65 mse = 0.2478 samples = 190 value = 1.4526 63->65 67 avalanche_problem_1_probability_id ≤ 2.5 mse = 0.0387 samples = 5297 value = 2.0136 66->67 74 avalanche_problem_1_problem_type_id_class ≤ 3.0 mse = 0.1952 samples = 188 value = 2.734 66->74 68 mountain_weather_wind_direction_num ≤ 2.5 mse = 0.2954 samples = 78 value = 1.4231 67->68 71 emergency_warning_Ikke gitt ≤ 0.5 mse = 0.0296 samples = 5219 value = 2.0224 67->71 69 mse = 0.1956 samples = 30 value = 1.9333 68->69 70 mse = 0.0933 samples = 48 value = 1.1042 68->70 72 mse = 0.0 samples = 12 value = 3.0 71->72 73 mse = 0.0274 samples = 5207 value = 2.0202 71->73 75 emergency_warning_Naturlig utløste skred ≤ 0.5 mse = 0.1262 samples = 54 value = 2.1481 74->75 78 author_anitaaw@met ≤ 0.5 mse = 0.029 samples = 134 value = 2.9701 74->78 76 mse = 0.0399 samples = 48 value = 2.0417 75->76 77 mse = 0.0 samples = 6 value = 3.0 75->77 79 mse = 0.0149 samples = 132 value = 2.9848 78->79 80 mse = 0.0 samples = 2 value = 2.0 78->80 82 mountain_weather_temperature_elevation ≤ 1250.0 mse = 0.2439 samples = 83 value = 2.4217 81->82 91 avalanche_problem_1_trigger_simple_id ≤ 15.5 mse = 0.0617 samples = 
740 value = 2.9784 81->91 83 avalanche_problem_2_distribution_id ≤ 1.5 mse = 0.0227 samples = 43 value = 2.0233 82->83 86 avalanche_problem_2_trigger_simple_id_class ≤ 1.5 mse = 0.1275 samples = 40 value = 2.85 82->86 84 mse = 0.0 samples = 42 value = 2.0 83->84 85 mse = 0.0 samples = 1 value = 3.0 83->85 87 avalanche_problem_2_distribution_id ≤ 1.0 mse = 0.1875 samples = 8 value = 2.25 86->87 90 mse = 0.0 samples = 32 value = 3.0 86->90 88 mse = 0.0 samples = 2 value = 3.0 87->88 89 mse = 0.0 samples = 6 value = 2.0 87->89 92 mse = 0.0 samples = 18 value = 2.0 91->92 93 mountain_weather_precip_most_exposed ≤ 27.5 mse = 0.0388 samples = 722 value = 3.0028 91->93 94 author_Ronny@NVE ≤ 0.5 mse = 0.02 samples = 688 value = 2.9826 93->94 97 avalanche_problem_1_cause_id ≤ 14.0 mse = 0.2422 samples = 34 value = 3.4118 93->97 95 mse = 0.0134 samples = 664 value = 2.9895 94->95 96 mse = 0.1649 samples = 24 value = 2.7917 94->96 98 mse = 0.0 samples = 20 value = 3.0 97->98 99 mse = 0.0 samples = 14 value = 4.0 97->99 101 aval_problem_1_combined ≤ 7223.5 mse = 0.1556 samples = 445 value = 2.0921 100->101 112 avalanche_problem_1_distribution_id ≤ 1.5 mse = 0.1469 samples = 2843 value = 2.988 100->112 102 danger_level_prev1day ≤ 1.5 mse = 0.0823 samples = 401 value = 2.0025 101->102 109 danger_level_prev1day ≤ 2.5 mse = 0.0826 samples = 44 value = 2.9091 101->109 103 mse = 0.0 samples = 15 value = 1.0 102->103 104 avalanche_problem_1_probability_id ≤ 4.0 mse = 0.0449 samples = 386 value = 2.0415 102->104 105 mountain_weather_precip_region_prev1day ≤ 2.5 mse = 0.0333 samples = 381 value = 2.0289 104->105 108 mse = 0.0 samples = 5 value = 3.0 104->108 106 mse = 0.0115 samples = 347 value = 2.0058 105->106 107 mse = 0.1946 samples = 34 value = 2.2647 105->107 110 mse = 0.0 samples = 4 value = 2.0 109->110 111 mse = 0.0 samples = 40 value = 3.0 109->111 113 emergency_warning_Ikke gitt ≤ 0.5 mse = 0.1653 samples = 268 value = 2.209 112->113 120 
avalanche_problem_1_sensitivity_id_class ≤ 5.5 mse = 0.0752 samples = 2575 value = 3.0691 112->120 114 mse = 0.0 samples = 35 value = 3.0 113->114 115 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.082 samples = 233 value = 2.0901 113->115 116 mountain_weather_temperature_max ≤ -9.5 mse = 0.043 samples = 222 value = 2.045 115->116 119 mse = 0.0 samples = 11 value = 3.0 115->119 117 mse = 0.1875 samples = 8 value = 2.75 116->117 118 mse = 0.0183 samples = 214 value = 2.0187 116->118 121 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.0343 samples = 2383 value = 3.0231 120->121 128 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.2302 samples = 192 value = 3.6406 120->128 122 aval_problem_1_combined ≤ 7343.0 mse = 0.0233 samples = 2348 value = 3.0115 121->122 125 author_ragnhildn@met ≤ 0.5 mse = 0.16 samples = 35 value = 3.8 121->125 123 mse = 0.0167 samples = 2328 value = 3.0047 122->123 124 mse = 0.16 samples = 20 value = 3.8 122->124 126 mse = 0.0622 samples = 30 value = 3.9333 125->126 127 mse = 0.0 samples = 5 value = 3.0 125->127 129 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.1956 samples = 90 value = 3.2667 128->129 132 mountain_weather_wind_direction_NE ≤ 0.5 mse = 0.0285 samples = 102 value = 3.9706 128->132 130 mse = 0.1302 samples = 78 value = 3.1538 129->130 131 mse = 0.0 samples = 12 value = 4.0 129->131 133 mse = 0.0099 samples = 100 value = 3.99 132->133 134 mse = 0.0 samples = 2 value = 3.0 132->134

The first avalanche problem dictates the danger level - that was expected :-)

We need to evaluate our model:


In [35]:
# R^2 of the current decision tree on the held-out test split.
r2_tree = dec_tree.score(X_test, y_test)
print('Decision tree R^2: %.4f' % r2_tree)


Decision tree R^2: 0.9863

In [36]:
# Fit one regression tree per maximum depth (1..19) and score each on the test split.
depths = range(1, 20)

tree_models = [tree.DecisionTreeRegressor(random_state=222, max_depth=d).fit(X_train, y_train) for d in depths]
tree_Rsquare = [f.score(X_test, y_test) for f in tree_models]
# NOTE(review): the depth is being picked from test-split scores, so the reported
# R^2 is optimistic; consider a separate validation split or cross-validation.

# Explicit figure/axes, a title and integer ticks so the plot stands on its own.
fig, ax = plt.subplots()
ax.plot(depths, tree_Rsquare, color = 'red')
ax.set_xlabel('Tree depth')
ax.set_ylabel('$R^2$')
ax.set_title('Test-set $R^2$ vs. maximum tree depth')
ax.set_xticks(list(depths));  # trailing ';' keeps the return value out of the cell output


Out[36]:
Text(0,0.5,'$R^2$')

Looks like there is little gain when using a depth > 7.


In [37]:
# Refit the regression tree with the depth chosen from the sweep: max_depth = 7.
dec_tree = tree.DecisionTreeRegressor(max_depth=7, random_state=222)
dec_tree.fit(X_train, y_train)  # same training split as before

# Predictions for the held-out rows.
predictions_dt = dec_tree.predict(X_test)

In [38]:
# Visualize the tree
dot_data = tree.export_graphviz(dec_tree, out_file=None, 
                         feature_names=df_data.drop([target_, 'date'], axis=1).columns,   
                         filled=True, rounded=True,  
                         special_characters=True)  

graph = graphviz.Source(dot_data) 
graph.render('aval_danger_by_problem_dt2')
graph


Out[38]:
Tree 0 avalanche_problem_1_destructive_size_ext_id ≤ 1.5 mse = 1.4041 samples = 23284 value = 1.1395 1 avalanche_problem_1_ext_id ≤ 5.0 mse = 0.1303 samples = 12228 value = 0.1243 0->1 True 48 avalanche_problem_1_destructive_size_ext_id ≤ 2.5 mse = 0.4124 samples = 11056 value = 2.2623 0->48 False 2 mountain_weather_wind_direction_num ≤ 4.5 mse = 0.0026 samples = 10861 value = 0.0022 1->2 15 avalanche_problem_2_problem_type_id_class ≤ 4.5 mse = 0.0855 samples = 1367 value = 1.0944 1->15 3 author_Tommy@NVE ≤ 0.5 mse = 0.0009 samples = 10847 value = 0.0009 2->3 10 avalanche_problem_2_trigger_simple_id_class ≤ 1.0 mse = 0.2857 samples = 14 value = 1.0 2->10 4 mountain_weather_temperature_min ≤ -11.0 mse = 0.0004 samples = 10841 value = 0.0004 3->4 9 mse = 0.0 samples = 6 value = 1.0 3->9 5 mse = 0.0 samples = 2 value = 1.0 4->5 6 author_heidi@nve.no ≤ 0.5 mse = 0.0002 samples = 10839 value = 0.0002 4->6 7 mse = 0.0 samples = 10837 value = 0.0 6->7 8 mse = 0.0 samples = 2 value = 1.0 6->8 11 mountain_weather_wind_speed_num ≤ 4.5 mse = 0.1389 samples = 12 value = 0.8333 10->11 14 mse = 0.0 samples = 2 value = 2.0 10->14 12 mse = 0.0 samples = 2 value = 0.0 11->12 13 mse = 0.0 samples = 10 value = 1.0 11->13 16 avalanche_problem_1_destructive_size_ext_id ≤ 0.5 mse = 0.0368 samples = 1229 value = 1.0382 15->16 27 danger_level_prev1day ≤ 1.5 mse = 0.2411 samples = 138 value = 1.5942 15->27 17 mse = 0.0 samples = 11 value = 2.0 16->17 18 avalanche_problem_2_destructive_size_ext_id ≤ 2.5 mse = 0.0287 samples = 1218 value = 1.0296 16->18 19 aval_problem_1_combined ≤ 7181.0 mse = 0.0211 samples = 1208 value = 1.0215 18->19 26 mse = 0.0 samples = 10 value = 2.0 18->26 20 region_group_id ≤ 0.5 mse = 0.018 samples = 1202 value = 1.0183 19->20 23 avalanche_problem_2_problem_type_id_prev1day ≤ 15.0 mse = 0.2222 samples = 6 value = 1.6667 19->23 21 mse = 0.0 samples = 2 value = 2.0 20->21 22 mse = 0.0164 samples = 1200 value = 1.0167 20->22 24 mse = 0.0 samples = 4 value = 2.0 
23->24 25 mse = 0.0 samples = 2 value = 1.0 23->25 28 avalanche_problem_1_problem_type_id_prev3day ≤ 2.5 mse = 0.0794 samples = 46 value = 1.087 27->28 35 avalanche_problem_1_problem_type_id_prev3day ≤ 8.5 mse = 0.129 samples = 92 value = 1.8478 27->35 29 mse = 0.0 samples = 2 value = 2.0 28->29 30 mountain_weather_temperature_max ≤ -14.5 mse = 0.0434 samples = 44 value = 1.0455 28->30 31 mse = 0.0 samples = 1 value = 2.0 30->31 32 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.0227 samples = 43 value = 1.0233 30->32 33 mse = 0.0 samples = 42 value = 1.0 32->33 34 mse = 0.0 samples = 1 value = 2.0 32->34 36 avalanche_problem_2_cause_id ≤ 12.0 mse = 0.2495 samples = 23 value = 1.5217 35->36 41 mountain_weather_temperature_elevation ≤ 1600.0 mse = 0.0416 samples = 69 value = 1.9565 35->41 37 mse = 0.0 samples = 9 value = 1.0 36->37 38 author_jan arild@obskorps ≤ 0.5 mse = 0.1224 samples = 14 value = 1.8571 36->38 39 mse = 0.0 samples = 12 value = 2.0 38->39 40 mse = 0.0 samples = 2 value = 1.0 38->40 42 aval_problem_1_combined ≤ 6230.5 mse = 0.0156 samples = 63 value = 1.9841 41->42 45 mountain_weather_precip_region_prev1day ≤ 3.5 mse = 0.2222 samples = 6 value = 1.6667 41->45 43 mse = 0.0 samples = 58 value = 2.0 42->43 44 mse = 0.16 samples = 5 value = 1.8 42->44 46 mse = 0.0 samples = 2 value = 1.0 45->46 47 mse = 0.0 samples = 4 value = 2.0 45->47 49 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.2644 samples = 7768 value = 2.0064 48->49 100 avalanche_problem_1_trigger_simple_id_class ≤ 1.5 mse = 0.242 samples = 3288 value = 2.8668 48->100 50 avalanche_problem_1_distribution_id ≤ 1.5 mse = 0.1717 samples = 6945 value = 1.8979 49->50 81 aval_problem_1_combined ≤ 2357.0 mse = 0.1082 samples = 823 value = 2.9222 49->81 51 danger_level_prev1day ≤ 1.5 mse = 0.2346 samples = 1460 value = 1.3705 50->51 66 avalanche_problem_1_probability_id ≤ 4.0 mse = 0.0613 samples = 5485 value = 2.0383 50->66 52 mountain_weather_precip_most_exposed ≤ 7.5 mse = 0.1071 samples = 
784 value = 1.1186 51->52 59 avalanche_problem_1_sensitivity_id_class ≤ 2.5 mse = 0.2235 samples = 676 value = 1.6627 51->59 53 author_Ronny@NVE ≤ 0.5 mse = 0.0604 samples = 717 value = 1.0614 52->53 56 avalanche_problem_1_cause_id_prev2day ≤ 16.5 mse = 0.1965 samples = 67 value = 1.7313 52->56 54 mse = 0.0517 samples = 713 value = 1.0547 53->54 55 mse = 0.1875 samples = 4 value = 2.25 53->55 57 mse = 0.0916 samples = 49 value = 1.898 56->57 58 mse = 0.2006 samples = 18 value = 1.2778 56->58 60 avalanche_problem_2_problem_type_id_class ≤ 6.5 mse = 0.1684 samples = 154 value = 1.2143 59->60 63 mountain_weather_temperature_min ≤ -2.5 mse = 0.163 samples = 522 value = 1.795 59->63 61 mse = 0.0416 samples = 115 value = 1.0435 60->61 62 mse = 0.2025 samples = 39 value = 1.7179 60->62 64 mse = 0.009 samples = 332 value = 1.991 63->64 65 mse = 0.2478 samples = 190 value = 1.4526 63->65 67 avalanche_problem_1_probability_id ≤ 2.5 mse = 0.0387 samples = 5297 value = 2.0136 66->67 74 avalanche_problem_1_problem_type_id_class ≤ 3.0 mse = 0.1952 samples = 188 value = 2.734 66->74 68 mountain_weather_wind_direction_num ≤ 2.5 mse = 0.2954 samples = 78 value = 1.4231 67->68 71 emergency_warning_Ikke gitt ≤ 0.5 mse = 0.0296 samples = 5219 value = 2.0224 67->71 69 mse = 0.1956 samples = 30 value = 1.9333 68->69 70 mse = 0.0933 samples = 48 value = 1.1042 68->70 72 mse = 0.0 samples = 12 value = 3.0 71->72 73 mse = 0.0274 samples = 5207 value = 2.0202 71->73 75 emergency_warning_Naturlig utløste skred ≤ 0.5 mse = 0.1262 samples = 54 value = 2.1481 74->75 78 author_anitaaw@met ≤ 0.5 mse = 0.029 samples = 134 value = 2.9701 74->78 76 mse = 0.0399 samples = 48 value = 2.0417 75->76 77 mse = 0.0 samples = 6 value = 3.0 75->77 79 mse = 0.0149 samples = 132 value = 2.9848 78->79 80 mse = 0.0 samples = 2 value = 2.0 78->80 82 mountain_weather_temperature_elevation ≤ 1250.0 mse = 0.2439 samples = 83 value = 2.4217 81->82 91 avalanche_problem_1_trigger_simple_id ≤ 15.5 mse = 0.0617 samples = 
740 value = 2.9784 81->91 83 avalanche_problem_2_distribution_id ≤ 1.5 mse = 0.0227 samples = 43 value = 2.0233 82->83 86 avalanche_problem_2_trigger_simple_id_class ≤ 1.5 mse = 0.1275 samples = 40 value = 2.85 82->86 84 mse = 0.0 samples = 42 value = 2.0 83->84 85 mse = 0.0 samples = 1 value = 3.0 83->85 87 avalanche_problem_2_distribution_id ≤ 1.0 mse = 0.1875 samples = 8 value = 2.25 86->87 90 mse = 0.0 samples = 32 value = 3.0 86->90 88 mse = 0.0 samples = 2 value = 3.0 87->88 89 mse = 0.0 samples = 6 value = 2.0 87->89 92 mse = 0.0 samples = 18 value = 2.0 91->92 93 mountain_weather_precip_most_exposed ≤ 27.5 mse = 0.0388 samples = 722 value = 3.0028 91->93 94 author_Ronny@NVE ≤ 0.5 mse = 0.02 samples = 688 value = 2.9826 93->94 97 avalanche_problem_1_cause_id ≤ 14.0 mse = 0.2422 samples = 34 value = 3.4118 93->97 95 mse = 0.0134 samples = 664 value = 2.9895 94->95 96 mse = 0.1649 samples = 24 value = 2.7917 94->96 98 mse = 0.0 samples = 20 value = 3.0 97->98 99 mse = 0.0 samples = 14 value = 4.0 97->99 101 aval_problem_1_combined ≤ 7223.5 mse = 0.1556 samples = 445 value = 2.0921 100->101 112 avalanche_problem_1_distribution_id ≤ 1.5 mse = 0.1469 samples = 2843 value = 2.988 100->112 102 danger_level_prev1day ≤ 1.5 mse = 0.0823 samples = 401 value = 2.0025 101->102 109 danger_level_prev1day ≤ 2.5 mse = 0.0826 samples = 44 value = 2.9091 101->109 103 mse = 0.0 samples = 15 value = 1.0 102->103 104 avalanche_problem_1_probability_id ≤ 4.0 mse = 0.0449 samples = 386 value = 2.0415 102->104 105 mountain_weather_precip_region_prev1day ≤ 2.5 mse = 0.0333 samples = 381 value = 2.0289 104->105 108 mse = 0.0 samples = 5 value = 3.0 104->108 106 mse = 0.0115 samples = 347 value = 2.0058 105->106 107 mse = 0.1946 samples = 34 value = 2.2647 105->107 110 mse = 0.0 samples = 4 value = 2.0 109->110 111 mse = 0.0 samples = 40 value = 3.0 109->111 113 emergency_warning_Ikke gitt ≤ 0.5 mse = 0.1653 samples = 268 value = 2.209 112->113 120 
avalanche_problem_1_sensitivity_id_class ≤ 5.5 mse = 0.0752 samples = 2575 value = 3.0691 112->120 114 mse = 0.0 samples = 35 value = 3.0 113->114 115 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.082 samples = 233 value = 2.0901 113->115 116 mountain_weather_temperature_max ≤ -9.5 mse = 0.043 samples = 222 value = 2.045 115->116 119 mse = 0.0 samples = 11 value = 3.0 115->119 117 mse = 0.1875 samples = 8 value = 2.75 116->117 118 mse = 0.0183 samples = 214 value = 2.0187 116->118 121 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.0343 samples = 2383 value = 3.0231 120->121 128 avalanche_problem_1_distribution_id ≤ 2.5 mse = 0.2302 samples = 192 value = 3.6406 120->128 122 aval_problem_1_combined ≤ 7343.0 mse = 0.0233 samples = 2348 value = 3.0115 121->122 125 author_ragnhildn@met ≤ 0.5 mse = 0.16 samples = 35 value = 3.8 121->125 123 mse = 0.0167 samples = 2328 value = 3.0047 122->123 124 mse = 0.16 samples = 20 value = 3.8 122->124 126 mse = 0.0622 samples = 30 value = 3.9333 125->126 127 mse = 0.0 samples = 5 value = 3.0 125->127 129 avalanche_problem_1_destructive_size_ext_id ≤ 3.5 mse = 0.1956 samples = 90 value = 3.2667 128->129 132 mountain_weather_wind_direction_NE ≤ 0.5 mse = 0.0285 samples = 102 value = 3.9706 128->132 130 mse = 0.1302 samples = 78 value = 3.1538 129->130 131 mse = 0.0 samples = 12 value = 4.0 129->131 133 mse = 0.0099 samples = 100 value = 3.99 132->133 134 mse = 0.0 samples = 2 value = 3.0 132->134

In [39]:
# R^2 of the refitted decision tree on the held-out test split.
r2_tree = dec_tree.score(X_test, y_test)
print('Decision tree R^2: %.4f' % r2_tree)


Decision tree R^2: 0.9863

Now we see a slight improvement in $R^{2}$.

Maybe we'll get better results with random forests and/or gradient boosted trees.

Random forest


In [40]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of depth-limited regression trees; fixed seed for reproducibility.
forest_reg = RandomForestRegressor(random_state = 422, max_depth = 7)
# y_train is a one-column frame; flatten it to shape (n_samples,) so the
# estimator does not emit a DataConversionWarning about a column-vector y.
forest_reg.fit(X_train, np.ravel(y_train))

# Predictions for the held-out rows.
predictions_rf = forest_reg.predict(X_test)


C:\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

In [41]:
# R^2 of the random forest on the held-out test split.
r2_forest = forest_reg.score(X_test, y_test)
print('Random forest R^2: %.4f' % r2_forest)


Random forest R^2: 0.9876

Gradient boosting


In [42]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosted regression trees; fixed seed for reproducibility.
grad_boost = GradientBoostingRegressor(random_state = 222, max_depth = 6)

# y_train is a one-column frame; flatten it to shape (n_samples,) so the
# estimator does not emit a DataConversionWarning about a column-vector y.
grad_boost.fit(X_train, np.ravel(y_train))

# Predictions for the held-out rows.
predictions_gb = grad_boost.predict(X_test)

# R^2 on the held-out test split.
print('Gradient boosted tree R^2: %.4f' % grad_boost.score(X_test, y_test))


C:\Anaconda3\lib\site-packages\sklearn\utils\validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Gradient boosted tree R^2: 0.9941

Again, very small difference between random forests and boosted trees.

The last thing we can check is the importance of the variables - if some of the features are not as useful as the others in explaining the variability in our target variable, we can exclude them in order to simplify our model.


In [43]:
# Print every feature with its random-forest importance, least important first.
# Labels come from the frame the forest was fitted on, so each label lines up
# with the matching entry of feature_importances_.
feature_labels = np.array(X_train.columns)
importance = forest_reg.feature_importances_
# argsort() is ascending, so the most important features are printed last.
feature_indexes_by_importance = importance.argsort()
for index in feature_indexes_by_importance:
    print('{}-{:.2f}%'.format(feature_labels[index], (importance[index] *100.0)))


mountain_weather_wind_direction_W-0.00%
avalanche_problem_3_trigger_simple_id-0.00%
avalanche_problem_3_type_id-0.00%
author_torolav@obskorps-0.00%
region_type_id-0.00%
author_solveig@NVE-0.00%
avalanche_problem_2_cause_id_prev3day-0.00%
avalanche_problem_2_problem_type_id_prev3day-0.00%
author_martin@svv-0.00%
avalanche_problem_3_trigger_simple_id_class-0.00%
author_magnush@met-0.00%
author_knutinge@svv-0.00%
avalanche_problem_3_problem_id-0.00%
author_Andreas@nve-0.00%
author_EspenN-0.00%
author_HåvardT@met-0.00%
author_Ida@met-0.00%
author_Ingrid@NVE-0.00%
author_John Smits-0.00%
author_Julie@SVV-0.00%
author_Karsten@NVE-0.00%
author_MSA@nortind-0.00%
author_Ragnar@NVE-0.00%
author_ToreV@met-0.00%
author_haso@nve.no-0.00%
author_Espen Granan-0.00%
avalanche_problem_3_probability_id-0.00%
avalanche_problem_2_cause_id_prev1day-0.00%
avalanche_problem_3_exposed_height_2-0.00%
mountain_weather_wind_direction_SE-0.00%
mountain_weather_wind_direction_S-0.00%
mountain_weather_wind_direction_Not given-0.00%
mountain_weather_wind_direction_None-0.00%
mountain_weather_wind_direction_NW-0.00%
avalanche_problem_1_type_id-0.00%
avalanche_problem_2_exposed_height_2-0.00%
avalanche_problem_3_ext_id-0.00%
avalanche_problem_2_problem_id-0.00%
avalanche_problem_2_problem_type_id-0.00%
avalanche_problem_2_type_id-0.00%
author_jostein@nve-0.00%
avalanche_problem_3_cause_id-0.00%
avalanche_problem_3_exposed_height_1-0.00%
author_emma@nve-0.00%
mountain_weather_wind_direction_N-0.00%
mountain_weather_wind_direction_SW-0.00%
avalanche_problem_2_ext_id-0.00%
mountain_weather_precip_region_prev3daysum-0.00%
author_Jørgen@obskorps-0.00%
mountain_weather_freezing_level-0.00%
avalanche_problem_2_cause_id_prev2day-0.00%
avalanche_problem_1_exposed_height_2-0.00%
avalanche_problem_1_problem_type_id_prev1day-0.00%
author_Halvor@NVE-0.00%
author_siri@met-0.00%
mountain_weather_wind_direction_E-0.00%
avalanche_problem_2_problem_type_id_prev1day-0.00%
author_JonasD@ObsKorps-0.00%
avalanche_problem_3_destructive_size_ext_id-0.00%
avalanche_problem_2_probability_id-0.00%
author_torehum@svv-0.00%
avalanche_problem_3_problem_type_id-0.00%
author_Odd-Arne@NVE-0.00%
author_Matilda@MET-0.00%
author_jegu@NVE-0.00%
author_Eldbjorg@MET-0.00%
author_rue@nve-0.00%
danger_level_prev2day-0.00%
author_jan arild@obskorps-0.00%
avalanche_problem_2_problem_type_id_prev2day-0.00%
avalanche_problem_3_sensitivity_id_class-0.00%
avalanche_problem_3_distribution_id-0.00%
avalanche_problem_1_cause_id_prev2day-0.00%
avalanche_problem_1_problem_type_id_prev2day-0.00%
avalanche_problem_2_trigger_simple_id_class-0.00%
author_ragnhildn@met-0.00%
avalanche_problem_2_trigger_simple_id-0.00%
author_Silje@svv-0.00%
avalanche_problem_2_sensitivity_id_class-0.00%
author_anitaaw@met-0.00%
mountain_weather_precip_most_exposed_prev1day-0.00%
avalanche_problem_2_distribution_id-0.00%
region_group_id-0.01%
avalanche_problem_3_problem_type_id_class-0.01%
avalanche_problem_1_problem_type_id_prev3day-0.01%
mountain_weather_precip_region_prev1day-0.01%
author_heidi@nve.no-0.01%
avalanche_problem_2_cause_id-0.01%
avalanche_problem_1_cause_id_prev1day-0.01%
mountain_weather_wind_direction_NE-0.01%
avalanche_problem_2_exposed_height_1-0.01%
mountain_weather_wind_speed_num-0.01%
avalanche_problem_1_cause_id_prev3day-0.01%
region_id-0.01%
author_Ronny@NVE-0.01%
danger_level_prev3day-0.02%
mountain_weather_precip_region-0.02%
avalanche_problem_1_exposed_height_1-0.02%
author_Tommy@NVE-0.02%
mountain_weather_temperature_max-0.03%
emergency_warning_Naturlig utløste skred-0.03%
mountain_weather_temperature_elevation-0.04%
mountain_weather_wind_direction_num-0.06%
avalanche_problem_2_destructive_size_ext_id-0.06%
emergency_warning_Ikke gitt-0.06%
mountain_weather_precip_most_exposed-0.08%
mountain_weather_temperature_min-0.13%
avalanche_problem_2_problem_type_id_class-0.16%
avalanche_problem_1_probability_id-0.35%
danger_level_prev1day-0.40%
avalanche_problem_1_trigger_simple_id-0.41%
avalanche_problem_1_problem_id-0.42%
avalanche_problem_1_cause_id-0.46%
avalanche_problem_1_problem_type_id_class-0.46%
avalanche_problem_1_problem_type_id-0.47%
avalanche_problem_1_trigger_simple_id_class-0.49%
avalanche_problem_1_sensitivity_id_class-0.53%
avalanche_problem_1_ext_id-0.88%
aval_problem_1_combined-1.55%
avalanche_problem_1_distribution_id-5.11%
avalanche_problem_1_destructive_size_ext_id-87.56%

In [44]:
# Rebuild the design matrix from a small, hand-picked set of predictors.
# NOTE(review): this list does not follow a strict "importance >= 1%" cut-off
# from the random forest ranking printed above - e.g. aval_problem_1_combined
# (1.55%) is left out while avalanche_problem_2_probability_id (0.00%) is
# kept. Confirm the intended selection rule.
selected_features = [
    'avalanche_problem_1_distribution_id',
    'avalanche_problem_1_destructive_size_ext_id',
    'avalanche_problem_1_trigger_simple_id',
    'avalanche_problem_1_probability_id',
    'avalanche_problem_2_destructive_size_ext_id',
    'avalanche_problem_2_probability_id',
]
X = df_data[selected_features]
y = df_data['danger_level']  # selected as a single column, i.e. a 1-D Series

# Same 70/30 split and seed as used for the full feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 222,
)

In [45]:
# Refit gradient boosting on the reduced feature set.
# NOTE(review): max_depth is 7 here, whereas the full-feature model above used
# 6, so the two R^2 values are not a like-for-like comparison - confirm intent.
grad_boost = GradientBoostingRegressor(max_depth = 7, random_state = 222).fit(X_train, y_train)

# Predictions on the held-out test split of the reduced data.
predictions_gb = grad_boost.predict(X_test)

In [46]:
# R^2 of the reduced-feature gradient boosted model on the test split.
gb_r2 = grad_boost.score(X_test, y_test)
print('Gradient boosted tree R^2: %.4f' % gb_r2)


Gradient boosted tree R^2: 0.9839

In [ ]: