In [2]:
import pandas as pd  # check out Modin https://towardsdatascience.com/get-faster-pandas-with-modin-even-on-your-laptops-b527a2eeda74
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add path to APS modules
aps_pth = Path('.').absolute()
print(aps_pth)
if aps_pth not in sys.path:
    sys.path.append(aps_pth)
sns.set(style="white")
#from sklearn.preprocessing import LabelEncoder
#from pprint import pprint

#pd.set_option("display.max_rows",6)


D:\Dev\APS

In [8]:
data_pth = Path(r'./aps/notebooks/ml_varsom/all.csv')

In [9]:
df = pd.read_csv(data_pth, index_col=0)
df.head()


Out[9]:
Date1 avalanche_problem_1_cause_id avalanche_problem_1_cause_name DangerLevel FreezingLevelAltitude FreezingLevelTime MaxTemperature MinTemperature Precipitation_MostExposed_Median Precipitation_overall_ThirdQuartile TemperatureElevation WindClassification WindClassificationNumerical WindDirection WindDirectionNumeric WindPercentage RegionId NaturalAvalanches
index
0 2018-12-01 10 Nedføyket svakt lag med nysnø 1 500 15 3 0 2 0 700 Bris 4 SE 4 17.906 3011.0 0.0
1 2018-12-02 10 Nedføyket svakt lag med nysnø 1 500 5 3 -2 1 0 700 Frisk bris 5 SE 4 12.399 3011.0 0.0
2 2018-12-03 10 Nedføyket svakt lag med nysnø 1 400 21 2 -2 0 0 700 Bris 4 SE 4 16.469 3011.0 0.0
3 2018-12-04 10 Nedføyket svakt lag med nysnø 1 400 1 0 -4 2 1 700 Bris 4 SW 6 21.010 3011.0 0.0
4 2018-12-05 15 Dårlig binding mellom lag i fokksnøen 1 300 15 0 -3 6 4 700 Bris 4 SW 6 56.892 3011.0 0.0

In [12]:
df.columns


Out[12]:
Index(['Date1', 'avalanche_problem_1_cause_id',
       'avalanche_problem_1_cause_name', 'DangerLevel',
       'FreezingLevelAltitude', 'FreezingLevelTime', 'MaxTemperature',
       'MinTemperature', 'Precipitation_MostExposed_Median',
       'Precipitation_overall_ThirdQuartile', 'TemperatureElevation',
       'WindClassification', 'WindClassificationNumerical', 'WindDirection',
       'WindDirectionNumeric', 'WindPercentage', 'RegionId',
       'NaturalAvalanches', 'DangerLevel_prevday'],
      dtype='object')

In [10]:
df['DangerLevel_prevday'] = df['DangerLevel'].shift(1)

In [11]:
df.head()


Out[11]:
Date1 avalanche_problem_1_cause_id avalanche_problem_1_cause_name DangerLevel FreezingLevelAltitude FreezingLevelTime MaxTemperature MinTemperature Precipitation_MostExposed_Median Precipitation_overall_ThirdQuartile TemperatureElevation WindClassification WindClassificationNumerical WindDirection WindDirectionNumeric WindPercentage RegionId NaturalAvalanches DangerLevel_prevday
index
0 2018-12-01 10 Nedføyket svakt lag med nysnø 1 500 15 3 0 2 0 700 Bris 4 SE 4 17.906 3011.0 0.0 NaN
1 2018-12-02 10 Nedføyket svakt lag med nysnø 1 500 5 3 -2 1 0 700 Frisk bris 5 SE 4 12.399 3011.0 0.0 1.0
2 2018-12-03 10 Nedføyket svakt lag med nysnø 1 400 21 2 -2 0 0 700 Bris 4 SE 4 16.469 3011.0 0.0 1.0
3 2018-12-04 10 Nedføyket svakt lag med nysnø 1 400 1 0 -4 2 1 700 Bris 4 SW 6 21.010 3011.0 0.0 1.0
4 2018-12-05 15 Dårlig binding mellom lag i fokksnøen 1 300 15 0 -3 6 4 700 Bris 4 SW 6 56.892 3011.0 0.0 1.0

In [13]:
df['AP1_prevday'] = df['avalanche_problem_1_cause_id'].shift(1)
df['NaturalAvalanches_prevday'] = df['NaturalAvalanches'].shift(1)
df['Precipitation_overall_ThirdQuartile_prevday'] = df['Precipitation_overall_ThirdQuartile'].shift(1)
df['Precipitation_MostExposed_Median_prevday'] = df['Precipitation_MostExposed_Median'].shift(1)

In [14]:
df.columns


Out[14]:
Index(['Date1', 'avalanche_problem_1_cause_id',
       'avalanche_problem_1_cause_name', 'DangerLevel',
       'FreezingLevelAltitude', 'FreezingLevelTime', 'MaxTemperature',
       'MinTemperature', 'Precipitation_MostExposed_Median',
       'Precipitation_overall_ThirdQuartile', 'TemperatureElevation',
       'WindClassification', 'WindClassificationNumerical', 'WindDirection',
       'WindDirectionNumeric', 'WindPercentage', 'RegionId',
       'NaturalAvalanches', 'DangerLevel_prevday', 'AP1_prevday',
       'NaturalAvalanches_prevday',
       'Precipitation_overall_ThirdQuartile_prevday',
       'Precipitation_MostExposed_Median_prevday'],
      dtype='object')

In [15]:
df.filter(['Precipitation_MostExposed_Median_prevday', 'Precipitation_MostExposed_Median']).head()


Out[15]:
Precipitation_MostExposed_Median_prevday Precipitation_MostExposed_Median
index
0 NaN 2
1 2.0 1
2 1.0 0
3 0.0 2
4 2.0 6

In [24]:
df_ml = df.drop(['Date1', 'avalanche_problem_1_cause_name', 'RegionId', 'WindDirection', 'WindClassification'], axis=1)

In [30]:
df_ml.dropna(axis=1, inplace=True)

In [25]:
from sklearn import tree
from sklearn.model_selection import train_test_split

In [31]:
target_ = 'DangerLevel'
X = df_ml.drop([target_, 'avalanche_problem_1_cause_id'], axis=1)
y = df_ml.filter([target_], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 222, test_size = 0.3)

In [32]:
df_ml.describe()


Out[32]:
avalanche_problem_1_cause_id DangerLevel FreezingLevelAltitude FreezingLevelTime MaxTemperature MinTemperature Precipitation_MostExposed_Median Precipitation_overall_ThirdQuartile TemperatureElevation WindClassificationNumerical WindDirectionNumeric WindPercentage NaturalAvalanches
count 182.000000 182.000000 182.000000 182.000000 182.000000 182.000000 182.000000 182.000000 182.0 182.000000 182.000000 182.000000 182.000000
mean 14.186813 2.236264 420.329670 10.307692 -1.065934 -5.758242 5.972527 4.120879 700.0 4.615385 4.637363 21.602890 0.252747
std 4.958194 0.753777 674.781652 6.543002 4.618728 4.652192 8.623129 5.710967 0.0 1.158864 2.094542 7.752106 0.435786
min 10.000000 1.000000 0.000000 0.000000 -13.000000 -20.000000 0.000000 0.000000 700.0 2.000000 1.000000 10.368000 0.000000
25% 10.000000 2.000000 100.000000 5.250000 -4.000000 -9.000000 0.000000 0.000000 700.0 4.000000 3.000000 15.656250 0.000000
50% 14.000000 2.000000 200.000000 11.000000 -1.000000 -6.000000 3.000000 1.500000 700.0 4.000000 6.000000 20.235500 0.000000
75% 15.000000 3.000000 400.000000 15.000000 2.000000 -2.000000 8.000000 6.000000 700.0 5.000000 6.000000 26.311500 0.750000
max 24.000000 4.000000 2500.000000 23.000000 11.000000 5.000000 60.000000 30.000000 700.0 8.000000 8.000000 56.892000 1.000000

In [33]:
print(df_ml.isna().sum().sort_values(ascending=False))


NaturalAvalanches                      0
WindPercentage                         0
WindDirectionNumeric                   0
WindClassificationNumerical            0
TemperatureElevation                   0
Precipitation_overall_ThirdQuartile    0
Precipitation_MostExposed_Median       0
MinTemperature                         0
MaxTemperature                         0
FreezingLevelTime                      0
FreezingLevelAltitude                  0
DangerLevel                            0
avalanche_problem_1_cause_id           0
dtype: int64

In [40]:
score_ = []
for i in range(2,20):
    dec_tree = tree.DecisionTreeRegressor(random_state=222, max_depth = i)
    
    dec_tree.fit(X_train, y_train) # we're using the same data as in last linear model
    
    predictions_dt = dec_tree.predict(X_test)
    print(predictions_dt.shape, y_test.shape)
    score_.append(dec_tree.score(X_test, y_test))
    print(i, 'Decision tree R^2: %.4f' % dec_tree.score(X_test, y_test))


(55,) (55, 1)
2 Decision tree R^2: 0.4371
(55,) (55, 1)
3 Decision tree R^2: 0.5185
(55,) (55, 1)
4 Decision tree R^2: 0.4772
(55,) (55, 1)
5 Decision tree R^2: 0.4883
(55,) (55, 1)
6 Decision tree R^2: 0.5176
(55,) (55, 1)
7 Decision tree R^2: 0.5182
(55,) (55, 1)
8 Decision tree R^2: 0.4900
(55,) (55, 1)
9 Decision tree R^2: 0.4866
(55,) (55, 1)
10 Decision tree R^2: 0.4970
(55,) (55, 1)
11 Decision tree R^2: 0.4024
(55,) (55, 1)
12 Decision tree R^2: 0.4850
(55,) (55, 1)
13 Decision tree R^2: 0.4966
(55,) (55, 1)
14 Decision tree R^2: 0.2317
(55,) (55, 1)
15 Decision tree R^2: 0.2317
(55,) (55, 1)
16 Decision tree R^2: 0.2317
(55,) (55, 1)
17 Decision tree R^2: 0.2317
(55,) (55, 1)
18 Decision tree R^2: 0.2317
(55,) (55, 1)
19 Decision tree R^2: 0.2317

In [ ]: