notebook.community

Edit and run



In [2]:

    
import pandas as pd  # check out Modin https://towardsdatascience.com/get-faster-pandas-with-modin-even-on-your-laptops-b527a2eeda74
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add path to APS modules
aps_pth = Path('.').absolute()
print(aps_pth)
if aps_pth not in sys.path:
    sys.path.append(aps_pth)
sns.set(style="white")
#from sklearn.preprocessing import LabelEncoder
#from pprint import pprint

#pd.set_option("display.max_rows",6)









    



D:\Dev\APS



In [8]:

    
data_pth = Path(r'./aps/notebooks/ml_varsom/all.csv')



In [9]:

    
df = pd.read_csv(data_pth, index_col=0)
df.head()









    Out[9]:







  
    
      
      Date1
      avalanche_problem_1_cause_id
      avalanche_problem_1_cause_name
      DangerLevel
      FreezingLevelAltitude
      FreezingLevelTime
      MaxTemperature
      MinTemperature
      Precipitation_MostExposed_Median
      Precipitation_overall_ThirdQuartile
      TemperatureElevation
      WindClassification
      WindClassificationNumerical
      WindDirection
      WindDirectionNumeric
      WindPercentage
      RegionId
      NaturalAvalanches
    
    
      index
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      2018-12-01
      10
      Nedføyket svakt lag med nysnø
      1
      500
      15
      3
      0
      2
      0
      700
      Bris
      4
      SE
      4
      17.906
      3011.0
      0.0
    
    
      1
      2018-12-02
      10
      Nedføyket svakt lag med nysnø
      1
      500
      5
      3
      -2
      1
      0
      700
      Frisk bris
      5
      SE
      4
      12.399
      3011.0
      0.0
    
    
      2
      2018-12-03
      10
      Nedføyket svakt lag med nysnø
      1
      400
      21
      2
      -2
      0
      0
      700
      Bris
      4
      SE
      4
      16.469
      3011.0
      0.0
    
    
      3
      2018-12-04
      10
      Nedføyket svakt lag med nysnø
      1
      400
      1
      0
      -4
      2
      1
      700
      Bris
      4
      SW
      6
      21.010
      3011.0
      0.0
    
    
      4
      2018-12-05
      15
      Dårlig binding mellom lag i fokksnøen
      1
      300
      15
      0
      -3
      6
      4
      700
      Bris
      4
      SW
      6
      56.892
      3011.0
      0.0



In [12]:

    
df.columns









    Out[12]:





Index(['Date1', 'avalanche_problem_1_cause_id',
       'avalanche_problem_1_cause_name', 'DangerLevel',
       'FreezingLevelAltitude', 'FreezingLevelTime', 'MaxTemperature',
       'MinTemperature', 'Precipitation_MostExposed_Median',
       'Precipitation_overall_ThirdQuartile', 'TemperatureElevation',
       'WindClassification', 'WindClassificationNumerical', 'WindDirection',
       'WindDirectionNumeric', 'WindPercentage', 'RegionId',
       'NaturalAvalanches', 'DangerLevel_prevday'],
      dtype='object')



In [10]:

    
df['DangerLevel_prevday'] = df['DangerLevel'].shift(1)



In [11]:

    
df.head()









    Out[11]:







  
    
      
      Date1
      avalanche_problem_1_cause_id
      avalanche_problem_1_cause_name
      DangerLevel
      FreezingLevelAltitude
      FreezingLevelTime
      MaxTemperature
      MinTemperature
      Precipitation_MostExposed_Median
      Precipitation_overall_ThirdQuartile
      TemperatureElevation
      WindClassification
      WindClassificationNumerical
      WindDirection
      WindDirectionNumeric
      WindPercentage
      RegionId
      NaturalAvalanches
      DangerLevel_prevday
    
    
      index
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      2018-12-01
      10
      Nedføyket svakt lag med nysnø
      1
      500
      15
      3
      0
      2
      0
      700
      Bris
      4
      SE
      4
      17.906
      3011.0
      0.0
      NaN
    
    
      1
      2018-12-02
      10
      Nedføyket svakt lag med nysnø
      1
      500
      5
      3
      -2
      1
      0
      700
      Frisk bris
      5
      SE
      4
      12.399
      3011.0
      0.0
      1.0
    
    
      2
      2018-12-03
      10
      Nedføyket svakt lag med nysnø
      1
      400
      21
      2
      -2
      0
      0
      700
      Bris
      4
      SE
      4
      16.469
      3011.0
      0.0
      1.0
    
    
      3
      2018-12-04
      10
      Nedføyket svakt lag med nysnø
      1
      400
      1
      0
      -4
      2
      1
      700
      Bris
      4
      SW
      6
      21.010
      3011.0
      0.0
      1.0
    
    
      4
      2018-12-05
      15
      Dårlig binding mellom lag i fokksnøen
      1
      300
      15
      0
      -3
      6
      4
      700
      Bris
      4
      SW
      6
      56.892
      3011.0
      0.0
      1.0



In [13]:

    
df['AP1_prevday'] = df['avalanche_problem_1_cause_id'].shift(1)
df['NaturalAvalanches_prevday'] = df['NaturalAvalanches'].shift(1)
df['Precipitation_overall_ThirdQuartile_prevday'] = df['Precipitation_overall_ThirdQuartile'].shift(1)
df['Precipitation_MostExposed_Median_prevday'] = df['Precipitation_MostExposed_Median'].shift(1)



In [14]:

    
df.columns









    Out[14]:





Index(['Date1', 'avalanche_problem_1_cause_id',
       'avalanche_problem_1_cause_name', 'DangerLevel',
       'FreezingLevelAltitude', 'FreezingLevelTime', 'MaxTemperature',
       'MinTemperature', 'Precipitation_MostExposed_Median',
       'Precipitation_overall_ThirdQuartile', 'TemperatureElevation',
       'WindClassification', 'WindClassificationNumerical', 'WindDirection',
       'WindDirectionNumeric', 'WindPercentage', 'RegionId',
       'NaturalAvalanches', 'DangerLevel_prevday', 'AP1_prevday',
       'NaturalAvalanches_prevday',
       'Precipitation_overall_ThirdQuartile_prevday',
       'Precipitation_MostExposed_Median_prevday'],
      dtype='object')



In [15]:

    
df.filter(['Precipitation_MostExposed_Median_prevday', 'Precipitation_MostExposed_Median']).head()









    Out[15]:







  
    
      
      Precipitation_MostExposed_Median_prevday
      Precipitation_MostExposed_Median
    
    
      index
      
      
    
  
  
    
      0
      NaN
      2
    
    
      1
      2.0
      1
    
    
      2
      1.0
      0
    
    
      3
      0.0
      2
    
    
      4
      2.0
      6



In [24]:

    
df_ml = df.drop(['Date1', 'avalanche_problem_1_cause_name', 'RegionId', 'WindDirection', 'WindClassification'], axis=1)



In [30]:

    
df_ml.dropna(axis=1, inplace=True)



In [25]:

    
from sklearn import tree
from sklearn.model_selection import train_test_split



In [31]:

    
target_ = 'DangerLevel'
X = df_ml.drop([target_, 'avalanche_problem_1_cause_id'], axis=1)
y = df_ml.filter([target_], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 222, test_size = 0.3)



In [32]:

    
df_ml.describe()









    Out[32]:







  
    
      
      avalanche_problem_1_cause_id
      DangerLevel
      FreezingLevelAltitude
      FreezingLevelTime
      MaxTemperature
      MinTemperature
      Precipitation_MostExposed_Median
      Precipitation_overall_ThirdQuartile
      TemperatureElevation
      WindClassificationNumerical
      WindDirectionNumeric
      WindPercentage
      NaturalAvalanches
    
  
  
    
      count
      182.000000
      182.000000
      182.000000
      182.000000
      182.000000
      182.000000
      182.000000
      182.000000
      182.0
      182.000000
      182.000000
      182.000000
      182.000000
    
    
      mean
      14.186813
      2.236264
      420.329670
      10.307692
      -1.065934
      -5.758242
      5.972527
      4.120879
      700.0
      4.615385
      4.637363
      21.602890
      0.252747
    
    
      std
      4.958194
      0.753777
      674.781652
      6.543002
      4.618728
      4.652192
      8.623129
      5.710967
      0.0
      1.158864
      2.094542
      7.752106
      0.435786
    
    
      min
      10.000000
      1.000000
      0.000000
      0.000000
      -13.000000
      -20.000000
      0.000000
      0.000000
      700.0
      2.000000
      1.000000
      10.368000
      0.000000
    
    
      25%
      10.000000
      2.000000
      100.000000
      5.250000
      -4.000000
      -9.000000
      0.000000
      0.000000
      700.0
      4.000000
      3.000000
      15.656250
      0.000000
    
    
      50%
      14.000000
      2.000000
      200.000000
      11.000000
      -1.000000
      -6.000000
      3.000000
      1.500000
      700.0
      4.000000
      6.000000
      20.235500
      0.000000
    
    
      75%
      15.000000
      3.000000
      400.000000
      15.000000
      2.000000
      -2.000000
      8.000000
      6.000000
      700.0
      5.000000
      6.000000
      26.311500
      0.750000
    
    
      max
      24.000000
      4.000000
      2500.000000
      23.000000
      11.000000
      5.000000
      60.000000
      30.000000
      700.0
      8.000000
      8.000000
      56.892000
      1.000000



In [33]:

    
print(df_ml.isna().sum().sort_values(ascending=False))









    



NaturalAvalanches                      0
WindPercentage                         0
WindDirectionNumeric                   0
WindClassificationNumerical            0
TemperatureElevation                   0
Precipitation_overall_ThirdQuartile    0
Precipitation_MostExposed_Median       0
MinTemperature                         0
MaxTemperature                         0
FreezingLevelTime                      0
FreezingLevelAltitude                  0
DangerLevel                            0
avalanche_problem_1_cause_id           0
dtype: int64



In [40]:

    
score_ = []
for i in range(2,20):
    dec_tree = tree.DecisionTreeRegressor(random_state=222, max_depth = i)
    
    dec_tree.fit(X_train, y_train) # we're using the same data as in last linear model
    
    predictions_dt = dec_tree.predict(X_test)
    print(predictions_dt.shape, y_test.shape)
    score_.append(dec_tree.score(X_test, y_test))
    print(i, 'Decision tree R^2: %.4f' % dec_tree.score(X_test, y_test))









    



(55,) (55, 1)
2 Decision tree R^2: 0.4371
(55,) (55, 1)
3 Decision tree R^2: 0.5185
(55,) (55, 1)
4 Decision tree R^2: 0.4772
(55,) (55, 1)
5 Decision tree R^2: 0.4883
(55,) (55, 1)
6 Decision tree R^2: 0.5176
(55,) (55, 1)
7 Decision tree R^2: 0.5182
(55,) (55, 1)
8 Decision tree R^2: 0.4900
(55,) (55, 1)
9 Decision tree R^2: 0.4866
(55,) (55, 1)
10 Decision tree R^2: 0.4970
(55,) (55, 1)
11 Decision tree R^2: 0.4024
(55,) (55, 1)
12 Decision tree R^2: 0.4850
(55,) (55, 1)
13 Decision tree R^2: 0.4966
(55,) (55, 1)
14 Decision tree R^2: 0.2317
(55,) (55, 1)
15 Decision tree R^2: 0.2317
(55,) (55, 1)
16 Decision tree R^2: 0.2317
(55,) (55, 1)
17 Decision tree R^2: 0.2317
(55,) (55, 1)
18 Decision tree R^2: 0.2317
(55,) (55, 1)
19 Decision tree R^2: 0.2317



In [ ]:

	Date1	avalanche_problem_1_cause_id	avalanche_problem_1_cause_name	DangerLevel	FreezingLevelAltitude	FreezingLevelTime	MaxTemperature	MinTemperature	Precipitation_MostExposed_Median	Precipitation_overall_ThirdQuartile	TemperatureElevation	WindClassification	WindClassificationNumerical	WindDirection	WindDirectionNumeric	WindPercentage	RegionId	NaturalAvalanches
index
0	2018-12-01	10	Nedføyket svakt lag med nysnø	1	500	15	3	0	2	0	700	Bris	4	SE	4	17.906	3011.0	0.0
1	2018-12-02	10	Nedføyket svakt lag med nysnø	1	500	5	3	-2	1	0	700	Frisk bris	5	SE	4	12.399	3011.0	0.0
2	2018-12-03	10	Nedføyket svakt lag med nysnø	1	400	21	2	-2	0	0	700	Bris	4	SE	4	16.469	3011.0	0.0
3	2018-12-04	10	Nedføyket svakt lag med nysnø	1	400	1	0	-4	2	1	700	Bris	4	SW	6	21.010	3011.0	0.0
4	2018-12-05	15	Dårlig binding mellom lag i fokksnøen	1	300	15	0	-3	6	4	700	Bris	4	SW	6	56.892	3011.0	0.0

	avalanche_problem_1_cause_id	DangerLevel	FreezingLevelAltitude	FreezingLevelTime	MaxTemperature	MinTemperature	Precipitation_MostExposed_Median	Precipitation_overall_ThirdQuartile	TemperatureElevation	WindClassificationNumerical	WindDirectionNumeric	WindPercentage	NaturalAvalanches
count	182.000000	182.000000	182.000000	182.000000	182.000000	182.000000	182.000000	182.000000	182.0	182.000000	182.000000	182.000000	182.000000
mean	14.186813	2.236264	420.329670	10.307692	-1.065934	-5.758242	5.972527	4.120879	700.0	4.615385	4.637363	21.602890	0.252747
std	4.958194	0.753777	674.781652	6.543002	4.618728	4.652192	8.623129	5.710967	0.0	1.158864	2.094542	7.752106	0.435786
min	10.000000	1.000000	0.000000	0.000000	-13.000000	-20.000000	0.000000	0.000000	700.0	2.000000	1.000000	10.368000	0.000000
25%	10.000000	2.000000	100.000000	5.250000	-4.000000	-9.000000	0.000000	0.000000	700.0	4.000000	3.000000	15.656250	0.000000
50%	14.000000	2.000000	200.000000	11.000000	-1.000000	-6.000000	3.000000	1.500000	700.0	4.000000	6.000000	20.235500	0.000000
75%	15.000000	3.000000	400.000000	15.000000	2.000000	-2.000000	8.000000	6.000000	700.0	5.000000	6.000000	26.311500	0.750000
max	24.000000	4.000000	2500.000000	23.000000	11.000000	5.000000	60.000000	30.000000	700.0	8.000000	8.000000	56.892000	1.000000