Create a training dataset from Varsom data


In [2]:
import sys
import pandas as pd
import numpy as np
import json
from flatten_json import flatten
from pprint import pprint
from pathlib import Path
# make sure the aps module is in the pythonpath
APS_dir = str(Path.cwd().parents[1])
if APS_dir not in sys.path:
    sys.path.append(APS_dir)
import aps.aps_io.get_forecasts as gf
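
flatten_json is a small third-party dependency that turns nested JSON into flat key_subkey_index dictionaries. If it is unavailable, a hypothetical stand-in covering only the behaviour used below (underscore separator, skipping selected top-level keys) could look roughly like this sketch:

In [ ]:
def flatten_fallback(nested, separator='_', root_keys_to_ignore=frozenset()):
    """Minimal stand-in for flatten_json.flatten (illustrative sketch only):
    flattens nested dicts/lists into single-level keys joined by `separator`,
    skipping the given top-level keys."""
    flat = {}

    def _walk(obj, key=''):
        if isinstance(obj, dict):
            for k, v in obj.items():
                if key == '' and k in root_keys_to_ignore:
                    continue  # skip ignored root keys such as 'CountyList'
                _walk(v, k if key == '' else key + separator + k)
        elif isinstance(obj, (list, tuple)):
            for idx, v in enumerate(obj):
                _walk(v, key + separator + str(idx))  # list items get an index suffix
        else:
            flat[key] = obj

    _walk(nested)
    return flat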


In [3]:
# Nordvestlandet: Trollheimen, Romsdal, Sunnmøre
#warnings, url = gf.get_warnings_as_json([3022, 3023, 3024], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)

# Østlandet: Jotunheimen, Hallingdal, Vest-Telemark
#warnings, url = gf.get_warnings_as_json([3028, 3032, 3035], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)

# All of Norway
reg_ids_ = list(np.arange(3001, 3047))
warnings, url = gf.get_warnings_as_json(reg_ids_, "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)

### Use this small data extraction for testing
#warnings, url = gf.get_warnings_as_json([3022], "2018-01-01", "2018-01-15", lang_key=2, simple=False, recursive_count=5)

print(url, '\n\n', type(warnings), len(warnings))
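
Each element of warnings is one nested forecast dict. The exact payload comes from the Varsom API and is not reproduced here; the rough shape below is purely illustrative, limited to field names that reappear as flattened columns later in this notebook (the values are made up):

In [ ]:
# Illustrative only -- real forecasts contain many more fields.
example_warning = {
    'ValidFrom': '2018-01-01T00:00:00',
    'DangerLevel': '2',
    'CountyList': ['...'],        # ignored when flattening below
    'MunicipalityList': ['...'],  # ignored when flattening below
    'AvalancheProblems': [
        {'AvalancheProblemTypeId': 10, 'AvalPropagationId': 2, 'AvalProbabilityId': 3,
         'AvalTriggerSimpleId': 10, 'AvalancheTypeId': 20, 'DestructiveSizeExtId': 2},
        # ... possibly more problems
    ],
    'MountainWeather': {
        'MeasurementTypes': [
            {'Name': 'Wind', 'MeasurementSubTypes': [{'Name': 'Speed', 'Value': 'Breeze'},
                                                     {'Name': 'Direction', 'Value': 'NW'}]},
            # ... more measurement types (Temperature, Rainfall, ...)
        ]
    },
}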


In [3]:
# since get_warnings_as_json returns a list, we have to apply "flatten" to each item... 
warnings_flattened = (flatten(w, root_keys_to_ignore={'CountyList', 'MunicipalityList'}) for w in warnings)
# TODO: avalanche problems are not labeled correctly by their priority - need to correct.

In [4]:
df = pd.DataFrame(warnings_flattened)
df.head(5)


In [ ]:
#df.columns.values

In [ ]:
#df2 = df

In [ ]:
#df = df2

In [ ]:
# create one column per MountainWeather measurement (e.g. Wind_Speed, Temperature_Max) and populate it
for index, row in df.iterrows():
    for i in range(5):
        for j in range(4):
            try:
                col_name = '{MWType} {MWSubType}'.format(
                    MWType=row['MountainWeather_MeasurementTypes_{0}_Name'.format(i)],
                    MWSubType=row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)])
                col_name = col_name.replace(' ', '_')
                if col_name not in df.columns.values:
                    df[col_name] = np.nan
                    print('Created column: ', col_name)
                df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
            except KeyError:
                print('MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}... does not exist - moving on.'.format(i, j))
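
The iterrows loop above becomes slow on larger extracts. A rough vectorized alternative that builds the same columns (assuming the flattened column layout above; a sketch, not a drop-in replacement) could look like:

In [ ]:
# Sketch: build the weather columns column-wise instead of row-by-row.
weather_cols = {}
for i in range(5):
    for j in range(4):
        name_col = 'MountainWeather_MeasurementTypes_{0}_Name'.format(i)
        sub_col = 'MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)
        val_col = 'MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)
        if name_col not in df.columns or sub_col not in df.columns:
            continue
        # combined label per row, e.g. 'Wind' + 'Speed' -> 'Wind_Speed'
        labels = (df[name_col].astype(str) + ' ' + df[sub_col].astype(str)).str.replace(' ', '_')
        for label in labels[df[sub_col].notna()].unique():
            mask = labels == label
            col = weather_cols.setdefault(label, pd.Series(np.nan, index=df.index))
            col[mask] = df.loc[mask, val_col]
df = pd.concat([df, pd.DataFrame(weather_cols)], axis=1)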

In [ ]:
df.columns.values

In [ ]:
df[['ValidFrom',
    'Temperature_Max',
    'Temperature_Min',
    'Wind_Direction',
    'Wind_Speed']].head(12)

In [ ]:
# store all data before filtering and splitting
df.to_csv('varsom_all.csv', index_label='index')

Create training data

We want to predict the main avalanche problem. Therefore we store AP1_type and AP1_score in their own dataframe. We remove all parameters that are irrelevant for the algorithm, e.g. UtmZone, ValidTo, ...

Make all columns used in the analysis numerical and adjust indices where necessary.


In [ ]:
with open(r'../config/snoskred_keys.json') as jdata:
    snoskred_keys = json.load(jdata)

pprint(snoskred_keys)
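
The mapping file is project-specific; the cells below use the keys beaufort_scale_en, wind_dir_conv_en, Class_AvalancheProblemTypeId and Class_AvalDistributionId. The shape sketched here is a hypothetical example with placeholder values, not the real content of snoskred_keys.json:

In [ ]:
# Hypothetical shape of snoskred_keys.json (placeholder values only):
snoskred_keys_example = {
    "beaufort_scale_en": {"Calm": 0, "Breeze": 2, "Gale": 8},
    "wind_dir_conv_en": {"N": 1, "NE": 2, "E": 3},
    "Class_AvalancheProblemTypeId": {"10": 1, "30": 5},
    # five distribution classes, the upper three merged into one (cf. the regrouping below)
    "Class_AvalDistributionId": {"1": 1, "2": 2, "3": 3, "4": 3, "5": 3},
}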

In [ ]:
print(df['AvalancheProblems_1_AvalPropagationId'].unique(), type(df['AvalancheProblems_1_AvalPropagationId'].unique()[0]))

In [ ]:
# change wind speeds and directions to numerical values
df['Wind_Speed_Num'] = df['Wind_Speed'].apply(lambda i: snoskred_keys['beaufort_scale_en'][i])
df['Wind_Direction_Num'] = df['Wind_Direction'].apply(lambda i: 0 if pd.isna(i) else snoskred_keys['wind_dir_conv_en'][i])

# Re-group AvalancheProblemType
# AvalancheProblemType grouped by PWL, wet slab, wet loose, dry loose, storm slab, and wind slab (and glide avalanche).
df['AvalancheProblems_0_Class_AvalancheProblemTypeId'] = df['AvalancheProblems_0_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalancheProblemTypeId'] = df['AvalancheProblems_1_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))

# Distribution is labeled _Propagation_ in the API and has five classes. Rename it to _AvalDistribution_ and merge the upper three classes into one called _widespread_.
df['AvalancheProblems_0_Class_AvalDistributionId'] = df['AvalancheProblems_0_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalDistributionId'] = df['AvalancheProblems_1_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
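
The missing-value handling in the lambdas above repeats itself; a small hypothetical helper could centralize it (a sketch, not part of the original pipeline):

In [ ]:
def map_id(series, mapping):
    """Map a numeric id column through one of the snoskred_keys lookups,
    treating missing values as class 0."""
    return series.apply(lambda v: 0 if pd.isna(v) else int(mapping[str(int(v))]))

# Equivalent to one of the assignments above:
# df['AvalancheProblems_1_Class_AvalDistributionId'] = map_id(
#     df['AvalancheProblems_1_AvalPropagationId'], snoskred_keys['Class_AvalDistributionId'])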

In [ ]:
# Keep only columns that hold numerical values.
# AvalCause, AvalancheExt and AvalancheProblemType are directly correlated - keep only re-grouped ..._Class_AvalancheProblemType.
df_numdata = df.filter(['AvalancheProblems_0_AvalProbabilityId',
                        'AvalancheProblems_0_Class_AvalDistributionId',
                        'AvalancheProblems_0_AvalTriggerSimpleId',
                        'AvalancheProblems_0_AvalancheProblemId',
                        'AvalancheProblems_0_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_0_AvalancheTypeId',
                        'AvalancheProblems_0_DestructiveSizeExtId',
                        'AvalancheProblems_1_AvalProbabilityId',
                        'AvalancheProblems_1_Class_AvalDistributionId',
                        'AvalancheProblems_1_AvalTriggerSimpleId',
                        'AvalancheProblems_1_AvalancheProblemId',
                        'AvalancheProblems_1_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_1_AvalancheTypeId',
                        'AvalancheProblems_1_DestructiveSizeExtId',
                        'DangerLevel',
                        'ValidFrom',
                        'Rainfall_Most_exposed_area',
                        'Rainfall_Average',
                        'Wind_Speed_Num',
                        'Wind_Direction_Num',
                        'Temperature_Min',
                        'Temperature_Max',
                        'Temperature_masl',
                        'Freezing_Level_masl'], axis=1)

In [ ]:
df_numdata.fillna(0, inplace=True)

In [ ]:
# Check that there are no weird values.
for col in df_numdata.drop(['ValidFrom'], axis=1).columns.values:
    print(col, ': ', df_numdata[col].unique())

In [ ]:
# Remove all columns without data
#df_numdata = df_numdata.dropna(axis=1)

df_numdata.to_csv('varsom_numdata.csv', index_label='index')

Split data into a training and a test dataset

Randomly choose indices that will serve as test data and remove them from the training data. Maybe write a function that chooses a certain % as test data; a sketch of such a helper follows below.
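
Such a helper could look like the following sketch (the 1/3 default mirrors the cell below; the seed argument is an added assumption):

In [ ]:
def split_train_test(data, test_fraction=1/3, seed=None):
    """Shuffle the index and split off test_fraction of the rows as the test set."""
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(data.index)
    cutoff = int(np.floor(len(data) * test_fraction))
    return data.loc[shuffled[cutoff:]], data.loc[shuffled[:cutoff]]

# df_train, df_test = split_train_test(df_numdata, test_fraction=1/3)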


In [ ]:
# Randomly shuffle the index of df_numdata.
random_indices = np.random.permutation(df_numdata.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = len(df_numdata) // 3
print(test_cutoff)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
df_test = df_numdata.loc[random_indices[:test_cutoff]]
# Generate the train set with the rest of the data.
df_train = df_numdata.loc[random_indices[test_cutoff:]]

Separate the target variable into its own dataframe.


In [ ]:
df_train_data = df_train.drop(['DangerLevel'], axis=1)
df_test_data = df_test.drop(['DangerLevel'], axis=1)

df_train_target = df_train.filter(['DangerLevel'], axis=1)
df_test_target = df_test.filter(['DangerLevel'], axis=1)

In [ ]:
df_train_data.to_csv('varsom_train_data.csv', index_label='index')
df_test_data.to_csv('varsom_test_data.csv', index_label='index')

df_train_target.to_csv('varsom_train_target.csv', index_label='index')
df_test_target.to_csv('varsom_test_target.csv', index_label='index')
