In [2]:
import sys
import pandas as pd
import numpy as np
import json
from flatten_json import flatten
from pprint import pprint
from pathlib import Path
# make sure the aps module is in the pythonpath
APS_dir = str(Path.cwd().parents[1])
if APS_dir not in sys.path:
    sys.path.append(APS_dir)
import aps.aps_io.get_forecasts as gf
In [3]:
# Nordvestlandet: Trollheimen, Romsdal, Sunnmøre
#warnings, url = gf.get_warnings_as_json([3022, 3023, 3024], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)
# Østlandet: Jotunheimen, Hallingdal, Vest-Telemark
#warnings, url = gf.get_warnings_as_json([3028, 3032, 3035], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)
# All of Norway (region ids 3001-3046)
reg_ids_ = list(np.arange(3001, 3047))
warnings, url = gf.get_warnings_as_json(reg_ids_, "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)
### Use this small data extraction for testing
#warnings, url = gf.get_warnings_as_json([3022], "2018-01-01", "2018-01-15", lang_key=2, simple=False, recursive_count=5)
print(url, '\n\n', type(warnings), len(warnings))
In [4]:
# Since get_warnings_as_json returns a list of nested dicts, apply "flatten" to each item (lazily, via a generator).
warnings_flattened = (flatten(w, root_keys_to_ignore={'CountyList', 'MunicipalityList'}) for w in warnings)
# TODO: avalanche problems are not labeled correctly by their priority - need to correct.
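To see what flatten does, here is a minimal sketch on a made-up record (the field names mimic the API layout; the values are invented): nested keys are joined with underscores, which is exactly how the column names used further down come about.
In [ ]:
# Minimal illustration of flatten on a made-up record (values are invented, not API data).
example = {'DangerLevel': 2,
           'MountainWeather': {'MeasurementTypes': [
               {'Name': 'Wind', 'MeasurementSubTypes': [{'Name': 'Speed', 'Value': 'Breeze'}]}]},
           'CountyList': ['dropped by root_keys_to_ignore']}
# yields e.g. 'MountainWeather_MeasurementTypes_0_MeasurementSubTypes_0_Value': 'Breeze'
pprint(flatten(example, root_keys_to_ignore={'CountyList', 'MunicipalityList'}))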
In [5]:
df = pd.DataFrame(warnings_flattened)
df.head(5)
In [ ]:
#df.columns.values
In [ ]:
#df2 = df.copy()  # keep a backup copy before modifying df in place
In [ ]:
#df = df2  # restore the backup
In [ ]:
# Create one column per MountainWeather measurement (sub-)type and populate it row by row.
for index, row in df.iterrows():
    for i in range(5):
        for j in range(4):
            try:
                col_name = '{MWType} {MWSubType}'.format(
                    MWType=row['MountainWeather_MeasurementTypes_{0}_Name'.format(i)],
                    MWSubType=row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)])
                col_name = col_name.replace(' ', '_')
                if col_name not in df.columns.values:
                    df[col_name] = np.nan
                    print('Created column: ', col_name)
                df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
            except KeyError:
                print('MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}... does not exist - moving on.'.format(i, j))
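The row-wise loop is easy to follow but slow on a full season across all regions. A vectorized sketch of the same idea (assuming, as above, that the flattened name/value columns exist; this is a sketch of the approach, not a drop-in replacement):
In [ ]:
# Vectorized sketch: pair each ..._Name column with its ..._Value column and
# assign whole column slices instead of single cells (assumes the flattened layout above).
for i in range(5):
    for j in range(4):
        name_i = 'MountainWeather_MeasurementTypes_{0}_Name'.format(i)
        sub_name = 'MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)
        sub_value = 'MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)
        if sub_value not in df.columns:
            continue
        # Combined target column name per row, e.g. 'Wind_Speed'.
        combos = (df[name_i].astype(str) + '_' + df[sub_name].astype(str)).str.replace(' ', '_')
        for col_name in combos.unique():
            mask = combos == col_name
            df.loc[mask, col_name] = df.loc[mask, sub_value]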
In [ ]:
df.columns.values
In [ ]:
df[['ValidFrom',
'Temperature_Max',
'Temperature_Min',
'Wind_Direction',
'Wind_Speed']].head(12)
In [ ]:
# store all data before filtering and splitting
df.to_csv('varsom_all.csv', index_label='index')
In [ ]:
with open(r'../config/snoskred_keys.json') as jdata:
snoskred_keys = json.load(jdata)
pprint(snoskred_keys)
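A quick sanity check that the lookup tables used in the next cells are actually present (the table contents themselves come from ../config/snoskred_keys.json):
In [ ]:
# Verify that the lookup tables the conversion cells below rely on exist.
for key in ('beaufort_scale_en', 'wind_dir_conv_en',
            'Class_AvalancheProblemTypeId', 'Class_AvalDistributionId'):
    assert key in snoskred_keys, 'missing lookup table: {0}'.format(key)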
In [ ]:
print(df['AvalancheProblems_1_AvalPropagationId'].unique(), type(df['AvalancheProblems_1_AvalPropagationId'].unique()[0]))
In [ ]:
# Convert wind speed and direction to numerical values.
# Note: missing values must be caught with pd.isna(); a comparison like `i == np.nan` is always False.
df['Wind_Speed_Num'] = df['Wind_Speed'].apply(lambda i: 0 if pd.isna(i) else snoskred_keys['beaufort_scale_en'][i])
df['Wind_Direction_Num'] = df['Wind_Direction'].apply(lambda i: 0 if pd.isna(i) else snoskred_keys['wind_dir_conv_en'][i])
# Re-group AvalancheProblemType into PWL, wet slab, wet loose, dry loose, storm slab and wind slab (and glide avalanche).
df['AvalancheProblems_0_Class_AvalancheProblemTypeId'] = df['AvalancheProblems_0_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalancheProblemTypeId'] = df['AvalancheProblems_1_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
# Distribution is labelled _Propagation_ in the API and has five classes. Rename to _AvalDistribution_ and merge the upper three classes into one called _widespread_.
df['AvalancheProblems_0_Class_AvalDistributionId'] = df['AvalancheProblems_0_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalDistributionId'] = df['AvalancheProblems_1_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
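The same regrouping can be written without apply by using Series.map, which returns NaN for missing or unmatched ids; a sketch using the same lookup tables as above:
In [ ]:
# Equivalent regrouping with Series.map (a sketch; same lookup tables as above).
# map() leaves missing/unmatched ids as NaN, which we then fill with 0.
problem_map = {int(k): int(v) for k, v in snoskred_keys['Class_AvalancheProblemTypeId'].items()}
for n in (0, 1):
    src = 'AvalancheProblems_{0}_AvalancheProblemTypeId'.format(n)
    dst = 'AvalancheProblems_{0}_Class_AvalancheProblemTypeId'.format(n)
    df[dst] = df[src].map(problem_map).fillna(0).astype(int)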
In [ ]:
# Keep only columns that hold numerical values.
# AvalCause, AvalancheExt and AvalancheProblemType are directly correlated - keep only re-grouped ..._Class_AvalancheProblemType.
df_numdata = df.filter(['AvalancheProblems_0_AvalProbabilityId',
'AvalancheProblems_0_Class_AvalDistributionId',
'AvalancheProblems_0_AvalTriggerSimpleId',
'AvalancheProblems_0_AvalancheProblemId',
'AvalancheProblems_0_Class_AvalancheProblemTypeId',
'AvalancheProblems_0_AvalancheTypeId',
'AvalancheProblems_0_DestructiveSizeExtId',
'AvalancheProblems_1_AvalProbabilityId',
'AvalancheProblems_1_Class_AvalDistributionId',
'AvalancheProblems_1_AvalTriggerSimpleId',
'AvalancheProblems_1_AvalancheProblemId',
'AvalancheProblems_1_Class_AvalancheProblemTypeId',
'AvalancheProblems_1_AvalancheTypeId',
'AvalancheProblems_1_DestructiveSizeExtId',
'DangerLevel',
'ValidFrom',
'Rainfall_Most_exposed_area',
'Rainfall_Average',
'Wind_Speed_Num',
'Wind_Direction_Num',
'Temperature_Min',
'Temperature_Max',
'Temperature_masl',
'Freezing_Level_masl'], axis=1)
In [ ]:
df_numdata.fillna(0, inplace=True)
In [ ]:
# Check each column for unexpected values.
for col in df_numdata.drop(['ValidFrom'], axis=1).columns.values:
print(col, ': ', df_numdata[col].unique())
In [ ]:
# Remove all columns without data
#df_numdata = df_numdata.dropna(axis=1)
df_numdata.to_csv('varsom_numdata.csv', index_label='index')
In [ ]:
# Randomly shuffle the index of df_numdata.
random_indices = np.random.permutation(df_numdata.index)
# Set a cutoff for how many items we want in the test set (here 1/3 of the items).
test_cutoff = int(np.floor(len(df_numdata) / 3))
print(test_cutoff)
# Generate the test set from the first 1/3 of the randomly shuffled indices
# (note [:test_cutoff], not [1:test_cutoff], so the first shuffled row is not dropped).
df_test = df_numdata.loc[random_indices[:test_cutoff]]
# Generate the train set from the rest of the data.
df_train = df_numdata.loc[random_indices[test_cutoff:]]
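scikit-learn's train_test_split does the same shuffle-and-split in one call, with a fixed seed for reproducibility (a sketch; scikit-learn is not imported or used elsewhere in this notebook):
In [ ]:
# Equivalent split with scikit-learn (assumes scikit-learn is installed).
from sklearn.model_selection import train_test_split
df_train_alt, df_test_alt = train_test_split(df_numdata, test_size=1/3, random_state=42)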
Separate the target variable (DangerLevel) into its own DataFrame.
In [ ]:
df_train_data = df_train.drop(['DangerLevel'], axis=1)
df_test_data = df_test.drop(['DangerLevel'], axis=1)
df_train_target = df_train.filter(['DangerLevel'], axis=1)
df_test_target = df_test.filter(['DangerLevel'], axis=1)
In [ ]:
df_train_data.to_csv('varsom_train_data.csv', index_label='index')
df_test_data.to_csv('varsom_test_data.csv', index_label='index')
df_train_target.to_csv('varsom_train_target.csv', index_label='index')
df_test_target.to_csv('varsom_test_target.csv', index_label='index')