In [ ]:
In [11]:
import pandas as pd
import numpy as np
import json
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from pprint import pprint
from pathlib import Path
pd.set_option("display.max_rows",6)
%matplotlib inline
In [15]:
# Absolute location of the notebook data folder, anchored at the current
# working directory (symlinks are not resolved).
aps_path = Path.cwd() / './aps/notebooks'
print(aps_path)
In [8]:
# Whole data set, loaded for exploratory analysis; the first CSV column
# becomes the frame's index.
# NOTE(review): the train/test loading cell that follows rebinds
# df_train_data, so on a top-to-bottom run this load is replaced — confirm
# which of the two inputs is intended.
df_train_data = pd.read_csv(aps_path.joinpath('varsom_numdata.csv'), index_col=0)
In [16]:
# Load the pre-split feature and target tables used to train and test.
# Files are read in the listed order and the first CSV column of each one
# becomes the index.
df_train_data, df_test_data, df_train_target, df_test_target = [
    pd.read_csv(aps_path / csv_name, index_col=0)
    for csv_name in (
        'varsom_train_data.csv',
        'varsom_test_data.csv',
        'varsom_train_target.csv',
        'varsom_test_target.csv',
    )
]
In [17]:
# Show the kernel's current working directory; aps_path above was built
# relative to it.
%pwd
Out[17]:
In [17]:
In [17]:
In [18]:
# Preview the first five training rows (head() defaults to n=5).
df_train_data.head()
Out[18]:
In [19]:
# List the column labels of the training frame as a NumPy array.
df_train_data.columns.to_numpy()
Out[19]:
In [20]:
# Sanity check: print the distinct values of every column except 'ValidFrom'
# so that weird entries stand out.
for col, column_values in df_train_data.drop(columns=['ValidFrom']).items():
    print(col, ': ', column_values.unique())
In [21]:
### Remove the "2|" in column Rainfall_Average
# NOTE: both statements below are commented out, so this cell currently has
# no effect on df_train_data when it is run.
#df_train_data = df_train_data[df_train_data['Rainfall_Average'] != '2|']
#print(df_train_data['Rainfall_Average'].unique())
In [22]:
# Histogram of the AvalancheProblems_0_AvalancheProblemId column.
# Alternative views that were tried (left disabled):
#df_train_data.plot(x='ValidFrom', y='AvalancheProblems_0_AvalancheProblemId', figsize=(16,10), kind='bar')
#df_train_data.plot(subplots=True, figsize=(16,20))
df_train_data['AvalancheProblems_0_AvalancheProblemId'].plot.hist(figsize=(10, 10))
Out[22]:
In [25]:
# Build the float arrays handed to scikit-learn.
# (Disabled alternative: split the target off df_train_data itself.)
#df_train_target = df_train_data.filter(['DangerLevel'], axis=1)
#df_train_data = df_train_data.drop(['DangerLevel'], axis=1)

# 'ValidFrom' is excluded from the features; every remaining column must be
# convertible to float.
train_features = df_train_data.drop(columns=['ValidFrom'])
data = np.array(train_features.to_numpy(), dtype=float)
target = np.array(df_train_target.to_numpy(), dtype=float)

# Show which columns ended up as features and which as target.
print(train_features.columns.to_numpy(), df_train_target.columns.to_numpy())
In [24]:
# Fit a decision-tree classifier on the training arrays.
# min_samples_leaf=8 requires every leaf to hold at least eight samples.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a fixed seed, equally good splits can be resolved
# differently from run to run and the exported tree is not reproducible.
clf = tree.DecisionTreeClassifier(min_samples_leaf=8, random_state=0)
clf = clf.fit(data, target)
In [ ]:
# Export the fitted tree as Graphviz DOT source and render it to disk.
feature_labels = df_train_data.drop(columns=['ValidFrom']).columns.to_numpy()
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT text instead of writing a file
    feature_names=feature_labels,
    # NOTE(review): export_graphviz pairs these names with the fitted classes
    # in ascending order — confirm that clf.classes_ holds exactly four levels.
    class_names=['low', 'moderate', 'considerable', 'high'],
    #proportion=True,  # show percentages instead of sample counts
    label="root",
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render("varsom")
In [ ]:
# Second data set; the first CSV column is used as the index.
# NOTE(review): this file is read relative to the working directory rather
# than aps_path — confirm where it is expected to live.
df_train_data2 = pd.read_csv('varsel_nordvestlandet_17_18.csv', index_col=0)
pprint(df_train_data2.columns.to_numpy())
In [ ]:
### Drop the rows whose Rainfall_Average holds the malformed entry "2|"
is_valid_rainfall = df_train_data2['Rainfall_Average'].ne('2|')
df_train_data2 = df_train_data2.loc[is_valid_rainfall]
#print(df_train_data2['Rainfall_Average'].unique())
In [ ]:
# Print the distinct values of every column to spot malformed entries.
for col, column_values in df_train_data2.items():
    print(col, ': ', column_values.unique())
In [ ]:
# Split the frame into the target column and the weather feature columns;
# every other column is discarded.
# NOTE: filter() silently skips labels that are absent, so re-running this
# cell after df_train_data2 has been narrowed leaves df_train_target2 empty.
df_train_target2 = df_train_data2.filter(
    items=['AvalancheProblems_0_Class_AvalancheProblemTypeId'],
    axis='columns',
)
df_train_data2 = df_train_data2.filter(
    items=[
        'Rainfall_Most_exposed_area',
        'Rainfall_Average',
        'Wind_Speed_Num',
        'Wind_Direction_Num',
        'Temperature_Min',
        'Temperature_Max',
        'Temperature_masl',
        'Freezing_Level_masl',
    ],
    axis='columns',
)
In [ ]:
# Create a new feature column holding the previous day's precipitation.
#
# "Previous day" is resolved by index label: the row labelled k receives the
# 'Rainfall_Most_exposed_area' value of the row labelled k-1. Rows whose
# predecessor label does not exist (the first row, or rows following a gap
# left by earlier filtering) get 0, and the missing label is printed.
#
# This is done with one vectorised lookup instead of iterating row by row
# and writing single cells with .loc: that is slow and writes values into a
# column that was initialised as int.
# Assumes a numeric, duplicate-free index (reindex raises on duplicates).
rain = df_train_data2['Rainfall_Most_exposed_area']
prev_labels = rain.index - 1
has_prev = prev_labels.isin(rain.index)

# Same diagnostic as before: report every predecessor label that is missing.
for missing_label in prev_labels[~has_prev]:
    print(missing_label)

# Look up the predecessor values, re-attach them to the current rows, and
# fall back to 0 only where the predecessor row is absent (a NaN that is
# genuinely present in the data is kept as NaN).
prev_rain = rain.reindex(prev_labels).set_axis(rain.index)
df_train_data2['Rainfall_Most_exposed_area_-1day'] = prev_rain.where(has_prev, 0)
In [ ]:
# Get the human-readable target labels.
# JSON text is UTF-8 by specification; the encoding is stated explicitly so
# the key file decodes the same way regardless of the platform's locale
# default.
with open(r'../config/snoskred_keys.json', encoding='utf-8') as jdata:
    snoskred_keys = json.load(jdata)

# Fit the encoder only to obtain the distinct class ids present in the target.
enc = LabelEncoder()
label_encoder = enc.fit(df_train_target2['AvalancheProblems_0_Class_AvalancheProblemTypeId'])
print ("Categorical classes:", label_encoder.classes_)

# Translate every class id to its name through the key file.
# NOTE(review): the lookup key is str(class_id); if the column is float this
# yields e.g. '10.0' — confirm it matches the key format in the JSON file.
class_names2 = [
    snoskred_keys['Class_AvalancheProblemTypeName'][str(class_id)]
    for class_id in label_encoder.classes_
]
print(class_names2)
In [ ]:
# Train a second tree on df_train_data2 / df_train_target2 and export it as a
# Graphviz drawing.
data2 = np.array(df_train_data2.values, dtype=float)
target2 = np.array(df_train_target2.values, dtype=float)

# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a fixed seed, tied splits can produce a different
# tree (and a different rendered graph) on each run.
clf2 = tree.DecisionTreeClassifier(min_samples_leaf=8, random_state=0)
clf2 = clf2.fit(data2, target2)

dot_data2 = tree.export_graphviz(
    clf2,
    out_file=None,  # return the DOT text instead of writing a file
    feature_names=df_train_data2.columns.values,
    class_names=class_names2,
    #proportion=True,  # show percentages instead of sample counts
    label="root",
    filled=True,
    rounded=True,
    special_characters=True,
)
graph2 = graphviz.Source(dot_data2)
graph2.render("avalanche_problem_meteo")
In [ ]:
In [ ]:
In [ ]:
# Regression-tree variant fitted on the same `data` / `target` arrays as the
# first classifier.
# NOTE: this rebinds clf2, dot_data2 and graph2 from the classifier cell above.
# random_state is pinned so that tied splits are resolved identically on
# every run.
clf2 = tree.DecisionTreeRegressor(random_state=0)
clf2 = clf2.fit(data, target)
dot_data2 = tree.export_graphviz(
    clf2,
    out_file=None,  # return the DOT text instead of writing a file
    # The names must list exactly the columns `data` was built from, i.e.
    # everything except 'ValidFrom'. Dropping 'ValidTo' as well would either
    # raise KeyError (column absent) or leave one name too few for the
    # fitted features.
    feature_names=df_train_data.drop(['ValidFrom'], axis=1).columns.values,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph2 = graphviz.Source(dot_data2)
graph2.render("varsom_reg")
In [ ]:
In [ ]:
In [ ]: