In [2]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd # check out Modin https://towardsdatascience.com/get-faster-pandas-with-modin-even-on-your-laptops-b527a2eeda74
import seaborn as sns
# Add path to APS modules.
# sys.path entries must be plain strings: the import system ignores any other
# type, and a Path never compares equal to an existing string entry. The path
# is therefore converted with str() for both the membership test and the append.
aps_pth = Path('.').absolute()
print(aps_pth)
if str(aps_pth) not in sys.path:
    sys.path.append(str(aps_pth))
# Use seaborn's "white" style for the plots drawn in this notebook.
sns.set(style="white")
# Optional imports / display settings, currently disabled:
#from sklearn.preprocessing import LabelEncoder
#from pprint import pprint
#pd.set_option("display.max_rows",6)
In [8]:
# Location of the input CSV; the path is relative, so it is resolved against
# the current working directory when the file is opened.
data_pth = Path(r'./aps/notebooks/ml_varsom/all.csv')
In [9]:
# Load the dataset, using the first CSV column as the row index.
df = pd.read_csv(data_pth, index_col=0)
# Preview the first rows.
df.head()
Out[9]:
In [12]:
# List the columns available in the raw frame.
df.columns
Out[12]:
In [10]:
# Lag feature: the danger level from the preceding row.
# The shift is applied within each RegionId so that the first row of one region
# does not inherit the last value of a different region.
# NOTE(review): assumes rows are in chronological order within each region — confirm.
df['DangerLevel_prevday'] = df.groupby('RegionId')['DangerLevel'].shift(1)
In [11]:
# Preview the frame after adding the lagged danger level.
df.head()
Out[11]:
In [13]:
# Further lag features: each new column holds the preceding row's value of its
# source column. As with the danger level, the shift is applied within each
# RegionId so values never cross from one region into another.
# NOTE(review): assumes rows are in chronological order within each region — confirm.
prevday_sources = {
    'AP1_prevday': 'avalanche_problem_1_cause_id',
    'NaturalAvalanches_prevday': 'NaturalAvalanches',
    'Precipitation_overall_ThirdQuartile_prevday': 'Precipitation_overall_ThirdQuartile',
    'Precipitation_MostExposed_Median_prevday': 'Precipitation_MostExposed_Median',
}
for lagged_name, source_name in prevday_sources.items():
    df[lagged_name] = df.groupby('RegionId')[source_name].shift(1)
In [14]:
# Confirm that the new *_prevday columns are present.
df.columns
Out[14]:
In [15]:
# Show the lagged precipitation column next to its source for a visual check
# of the shift.
df.filter(
    items=[
        'Precipitation_MostExposed_Median_prevday',
        'Precipitation_MostExposed_Median',
    ]
).head()
Out[15]:
In [24]:
# Modelling frame: a copy of df without the columns listed below.
df_ml = df.drop(
    columns=[
        'Date1',
        'avalanche_problem_1_cause_name',
        'RegionId',
        'WindDirection',
        'WindClassification',
    ]
)
In [30]:
# Remove missing data without throwing away the lag features.
# Every *_prevday column contains at least one NaN produced by shift(), so a
# plain column-wise dropna would discard all of them. Rows that carry no lag
# information at all (every *_prevday value missing) are removed first; any
# column that still has missing values afterwards is then dropped as before.
prevday_cols = [col for col in df_ml.columns if str(col).endswith('_prevday')]
if prevday_cols:
    df_ml = df_ml.dropna(axis=0, how='all', subset=prevday_cols)
# Reassign rather than mutate in place so re-running the cell is predictable.
df_ml = df_ml.dropna(axis=1)
In [25]:
from sklearn import tree
from sklearn.model_selection import train_test_split
In [31]:
# Split the modelling frame into predictors (X) and target (y), then hold out
# 30 % of the rows as a test set with a fixed seed.
target_ = 'DangerLevel'
X = df_ml.drop(columns=[target_, 'avalanche_problem_1_cause_id'])
# y stays a single-column DataFrame.
y = df_ml.filter(items=[target_], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=222,
)
In [32]:
# Summary statistics of the modelling frame.
df_ml.describe()
Out[32]:
In [33]:
# Report the number of missing values per column, largest count first.
missing_per_column = df_ml.isna().sum()
print(missing_per_column.sort_values(ascending=False))
In [40]:
# Sweep the maximum tree depth from 2 to 19 and record the R^2 on the test set
# for each fitted regressor.
# NOTE(review): the depth is compared on the test split itself; consider a
# separate validation split or cross-validation if a depth is chosen from this.
score_ = []
for i in range(2, 20):
    dec_tree = tree.DecisionTreeRegressor(random_state=222, max_depth=i)
    dec_tree.fit(X_train, y_train)
    predictions_dt = dec_tree.predict(X_test)
    print(predictions_dt.shape, y_test.shape)
    # Score once per depth and reuse the value for both the list and the log line.
    r2 = dec_tree.score(X_test, y_test)
    score_.append(r2)
    print(i, 'Decision tree R^2: %.4f' % r2)
In [ ]: