Create graphs from varsom data using plotly


In [2]:
import pandas as pd
import numpy as np
import json
import logging

#import plotly.offline as py
#py.download_plotlyjs()
#py.init_notebook_mode()
import plotly.plotly as py
import plotly.graph_objs as go
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [3]:


In [5]:
df = pd.read_csv(r'D:\Dev\APS\aps\data\varsom\norwegian_avalanche_warnings_season_17_18.csv', index_col=0, parse_dates=['valid_from', 'valid_to', 'date_valid'])
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3822 entries, 0 to 3821
Columns: 101 entries, author to valid_to
dtypes: datetime64[ns](3), float64(10), int64(46), object(42)
memory usage: 3.0+ MB

In [6]:
df.head(4)


Out[6]:
author avalanche_danger avalanche_problem_1_advice avalanche_problem_1_cause_id avalanche_problem_1_cause_name avalanche_problem_1_destructive_size_ext_id avalanche_problem_1_destructive_size_ext_name avalanche_problem_1_distribution_id avalanche_problem_1_distribution_name avalanche_problem_1_exposed_height_1 ... region_id region_name region_type_id region_type_name snow_surface utm_east utm_north utm_zone valid_from valid_to
index
0 Karsten@NVE Det er lite snø og generelt stabile forhold i ... Vær varsom der skredproblemet er å finne i ko... 15 Dårlig binding mellom lag i fokksnøen 2 2 - Middels 1 Få bratte heng 0 ... 3003 Nordenskiöld Land 10 A Det er generelt lite snø i terrenget. Rygger e... 520332 8663904 33 2017-12-01 2017-12-01 23:59:59
1 jan arild@obskorps Kraftig vind fra sørøst vil gi pålagring av fe... Vær forsiktig i områder brattere enn 30 grader... 10 Nedføyket svakt lag med nysnø 2 2 - Middels 2 Noen bratte heng 400 ... 3007 Vest-Finnmark 10 A Siste dagene har det kommet 25-30 cm snø utsat... 802123 7794717 33 2017-12-01 2017-12-01 23:59:59
2 jan arild@obskorps Generelle stabile forhold, men vindøkning fra ... Vær forsiktig i områder brattere enn 30 grader... 10 Nedføyket svakt lag med nysnø 2 2 - Middels 2 Noen bratte heng 400 ... 3009 Nord-Troms 10 A Siste dagene har det kommet 20-30 cm snø utsat... 750984 7742562 33 2017-12-01 2017-12-01 23:59:59
3 jan arild@obskorps Vindøkning fra sør vil gi pålagring av fersk f... Vær forsiktig i områder brattere enn 30 grader... 10 Nedføyket svakt lag med nysnø 2 2 - Middels 2 Noen bratte heng 400 ... 3010 Lyngen 10 A Siste dagene har det kommet 10-20 cm snø utsat... 692056 7719872 33 2017-12-01 2017-12-01 23:59:59

4 rows × 101 columns


In [7]:
x = df[df.region_id==3034]['date_valid']
y = df[df.region_id==3034]['danger_level']
data = [
    go.Bar(
        x=x,
        y=y
    )
]
py.iplot(data, filename='simple-bar-chart')


Out[7]:

In [43]:
#df[df.region_id==3034]["mountain_weather_temperature_max"] = np.where(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, np.nan)
#df[df.region_id==3034]["mountain_weather_temperature_max"].replace(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, 0 ,inplace=True)

x = df[df.region_id==3034]['date_valid']
y1 = df[df.region_id==3034]['mountain_weather_precip_region']
y2 = df[df.region_id==3034]['mountain_weather_temperature_max']#.replace(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, 0, inplace=True)
data = [
    go.Bar(
        x=x,
        y=y1,
        name='Precipitation'
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode = 'lines+markers',  # 'lines' / 'markers'
        name='Temperature',
        yaxis = 'y2'
    )
]

layout = go.Layout(
    title='Precipitation & Temperature',
    yaxis=dict(
        title='Precipitation (mm)'
    ),
    yaxis2=dict(
        title='Temperature (C)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='combined-line-bar-chart')


Out[43]:

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [23]:
#aw1718.plot.bar(x='date_valid', y=['avalanche_problem_1_destructive_size_ext_id', 'avalanche_problem_2_destructive_size_ext_id','avalanche_problem_3_destructive_size_ext_id'], ax = ax)
_reg_id = 3034

x = df[df.region_id==_reg_id]['date_valid']
y1 = df[df.region_id==_reg_id]['avalanche_problem_1_destructive_size_ext_id']
y2 = df[df.region_id==_reg_id]['avalanche_problem_2_destructive_size_ext_id']
y3 = df[df.region_id==_reg_id]['avalanche_problem_3_destructive_size_ext_id']

data = [
    go.Bar(
        x=x,
        y=y1,
        name='Problem 1'
    ),
    go.Bar(
        x=x,
        y=y2,
        name='Problem 2'
    ),
    go.Bar(
        x=x,
        y=y3,
        name='Problem 3'
    )
]

layout = go.Layout(
    barmode='group',
    title='Grouped Bar with Pandas'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-bar-chart-layout')


Out[23]:

In [30]:
# or more compact
df_ = df[df.region_id==3014]
data = []
for i in range(1,4):
    data.append(go.Bar(x=df_['date_valid'], y=df_[f'avalanche_problem_{i}_destructive_size_ext_id'], name=f'Problem {i}'))

layout = go.Layout(
    barmode = 'stack',
    title = f'Avalanche size by avalanche problem\nRegion: {df_.region_name.iloc[0]}'
    )

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='aval_size_per_problem')


C:\Anaconda3\lib\site-packages\plotly\tools.py:1424: UserWarning:

Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!

Out[30]:

In [40]:
df_ = df[df.region_id==3014]
x = df_['date_valid']
y = ['Problem 1', 'Problem 2', 'Problem 3']
z = []
for i in range(1,4):
    z.append(df_[f'avalanche_problem_{i}_destructive_size_ext_id'])

trace = go.Heatmap(x=x, y=y, z=z,
                  colorscale = 'Viridis')
data=[trace]
py.iplot(data, filename='labelled-heatmap')


Out[40]:

frost.met.no example - Get a time series


In [44]:
import dateutil.parser as dp
import requests

In [52]:
#!/usr/bin/python

"""

This program shows how to retrieve a time series of observations from the following
combination of source, element and time range:

source:     SN18700
element:    mean(wind_speed P1D)
time range: 2010-04-01 .. 2010-05-31

The time series is written to standard output as lines of the form:

  <observation time as date/time in ISO 8601 format> \
  <observation time as seconds since 1970-01-01T00:00:00> \
  <observed value>

Save the program to a file example.py, make it executable (chmod 755 example.py),
and run it e.g. like this:

  $ CLIENTID=8e6378f7-b3-ae4fe-683f-0db1eb31b24ec ./example.py

(Note: the client ID used in the example should be replaced with a real one)

The program has been tested on the following platforms:
  - Python 2.7.3 on Ubuntu 12.04 Precise
  - Python 2.7.12 and 3.5.2 on Ubuntu 16.04 Xenial

"""
client_id = '513df91a-2f62-4445-b4ba-0b3a1c4d16bb'

# issue an HTTP GET request
r = requests.get(
    'https://frost.met.no/observations/v0.jsonld',
    {'sources': 'SN18700', 'elements': 'mean(wind_speed P1D)', 'referencetime': '2018-12-01/2018-12-15'},
    auth=(client_id, '')
)

# extract the time series from the response
if r.status_code == 200:
    #print(r.json()['data'])
    for item in r.json()['data']:
        iso8601 = item['referenceTime']
        #print(iso8601)
        #secsSince1970 = dp.parse(iso8601).strftime('%s')
        print('{} {}\n'.format(iso8601, item['observations'][0]['value']))
else:
    print('error:\n')
    print('\tstatus code: {}\n'.format(r.status_code))
    if 'error' in r.json():
        assert(r.json()['error']['code'] == r.status_code)
        print('\tmessage: {}\n'.format(r.json()['error']['message']))
        print('\treason: {}\n'.format(r.json()['error']['reason']))
    else:
        print('\tother error\n')


2018-12-01T00:00:00.000Z 3.2

2018-12-02T00:00:00.000Z 1.8

2018-12-03T00:00:00.000Z 2.3

2018-12-04T00:00:00.000Z 3.5

2018-12-05T00:00:00.000Z 1.7

2018-12-06T00:00:00.000Z 2.6

2018-12-07T00:00:00.000Z 2.9

2018-12-08T00:00:00.000Z 2

2018-12-09T00:00:00.000Z 0.9

2018-12-10T00:00:00.000Z 3.8

2018-12-11T00:00:00.000Z 3.9

2018-12-12T00:00:00.000Z 1.7

2018-12-13T00:00:00.000Z 2.8

2018-12-14T00:00:00.000Z 2.2


In [ ]:


In [ ]:
# Nordvestlandet: Trollheimen, Romsdal, Sunnmøre
warnings, url = gf.get_warnings_as_json([3022, 3023, 3024], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)

# Østlandet: Jotunheimen, Hallingdal, Vest-Telemark
#warnings, url = gf.get_warnings_as_json([3028, 3032, 3035], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)


### Use this small data extraction for testing
#warnings, url = gf.get_warnings_as_json([3022], "2018-01-01", "2018-01-15", lang_key=2, simple=False, recursive_count=5)

print(url, '\n\n', type(warnings), len(warnings))

In [ ]:
pprint(warnings)

In [ ]:
# since get_warnings_as_json returns a list, we have to apply "flatten" to each item... 
warnings_flattened = (flatten(w, root_keys_to_ignore={'CountyList', 'MunicipalityList'}) for w in warnings)
# TODO: avalanche problems are not labeled correctly by their priority - need to correct.

In [ ]:
df = pd.DataFrame(warnings_flattened)
df.head(5)

In [ ]:
df.info()

In [ ]:
df.describe()

In [ ]:
df.drop_duplicates(keep='first', inplace=True)

In [ ]:
# save the current dataset asa csv
df.to_csv('forecasts_raw.csv', index_label='index')

Start pre-processing


In [ ]:
# create necessary columns and populate them
error_count = 0
log_file = r'./log/mountain_weather.log'
logging.basicConfig(filename=log_file, level=logging.DEBUG)
for index, row in df.iterrows():
    for i in range(5):
        for j in range(4):
            try:
                col_name = '{MWType} {MWSubType}'.format(MWType=row['MountainWeather_MeasurementTypes_{0}_Name'.format(i)], MWSubType=row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)])
                col_name = col_name.replace(' ', '_')
                if col_name in df.columns.values:
                    df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
                else:
                    df[col_name] = np.nan
                    print('Created column: ', col_name)
                    df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
                    
                #print('{MWType} {MWSubType} = {MWSubTypeValue}'.format(MWType=df.loc[index]['MountainWeather_MeasurementTypes_{0}_Name'.format(i)],
                 #                                                      MWSubType=df.loc[index]['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)],
                  #                                                     MWSubTypeValue=df.loc[index]['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]))
            except KeyError:
                error_count += 1
                logging.info('MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}... does not exist - moving on.'.format(i, j))
print(f"(Encountered {error_count} KeyError(s) - see log in {log_file})")

In [ ]:
# TODO: remove all MountainWeather_Measurement... columns
# df.columns.values

In [ ]:


In [ ]:


In [ ]:
df[['ValidFrom',
    'Temperature_Max',
    'Temperature_Min',
    'Wind_Direction',
    'Wind_Speed']].describe()

Encoding categorial attributes


In [ ]:
# load json-file with the wanted encoding
with open(r'../config/snoskred_keys.json') as jdata:
    snoskred_keys = json.load(jdata)

# print content
pprint(snoskred_keys)

In [ ]:
enc_df = df.copy()

In [ ]:
#print(df['AvalancheProblems_0_AvalPropagationId'].unique(), type(df['AvalancheProblems_0_AvalPropagationId'].unique()[0]))
print(enc_df['Wind_Direction'].unique())

In [ ]:
enc_df['Wind_Direction'].replace({None: 'Not given'}, inplace=True)

In [ ]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
wind_encoded = encoder.fit_transform(enc_df["Wind_Direction"])

for id_, class_ in enumerate(encoder.classes_):
    print(f"class id {id_} has label {class_}")

print()
print(f"Encoded wind direction values for first 10 entries: {wind_encoded[:10]}")

In [ ]:
# change wind speeds to numerical values
df['Wind_Speed_Num'] = df['Wind_Speed'].apply(lambda i: snoskred_keys['beaufort_scale_en'][i])
df['Wind_Direction_Num'] = df['Wind_Direction'].apply(lambda i: 0 if i == None else snoskred_keys['wind_dir_conv_en'][i])

# Re-group AvalancheProblemType
# AvalancheProblemType grouped by PWL, wet slab, wet loose, dry loose, storm slab, and wind slab (and glide avalanche).
df['AvalancheProblems_0_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_0_AvalancheProblemTypeId'].apply(lambda i: 0 if i == np.nan else np.int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_1_AvalancheProblemTypeId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))

# Distribution is labeled _Propagation_ in the API and has five classes. Change name to _AvalDistribution_ and merge the uper three classes into one called _widespread_.
df['AvalancheProblems_0_Class_AvalDistribution_Num'] = df['AvalancheProblems_0_AvalPropagationId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalDistribution_Num'] = df['AvalancheProblems_1_AvalPropagationId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))

In [ ]:
# Keep only columns that hold numerical values.
# AvalCause, AvalancheExt and AvalancheProblemType are directly correlated - keep only re-grouped ..._Class_AvalancheProblemType.
df_numdata = df.filter(['AvalancheProblems_0_AvalProbabilityId',
                        'AvalancheProblems_0_Class_AvalDistributionId',
                        'AvalancheProblems_0_AvalTriggerSimpleId',
                        'AvalancheProblems_0_AvalancheProblemId',
                        'AvalancheProblems_0_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_0_AvalancheTypeId',
                        'AvalancheProblems_0_DestructiveSizeExtId',
                        'AvalancheProblems_1_AvalProbabilityId',
                        'AvalancheProblems_1_Class_AvalDistributionId',
                        'AvalancheProblems_1_AvalTriggerSimpleId',
                        'AvalancheProblems_1_AvalancheProblemId',
                        'AvalancheProblems_1_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_1_AvalancheTypeId',
                        'AvalancheProblems_1_DestructiveSizeExtId',
                        'DangerLevel',
                        'ValidFrom',
                        'Rainfall_Most_exposed_area',
                        'Rainfall_Average',
                        'Wind_Speed_Num',
                        'Wind_Direction_Num',
                        'Temperature_Min',
                        'Temperature_Max',
                        'Temperature_masl',
                        'Freezing_Level_masl'], axis=1)

In [ ]:
df_numdata.fillna(0, inplace=True)

In [ ]:
# Check if there are no weired values.
for col in df_numdata.drop(['ValidFrom'], axis=1).columns.values:
    print(col, ': ', df_numdata[col].unique())

In [ ]:
df_numdata.hist(bins=50, figsize=(25, 20))
plt.show()

In [ ]:
# Remove all columns without data
#df_numdata = df_numdata.dropna(axis=1)

df_numdata.to_csv('varsom_numdata.csv', index_label='index')

Split data into a training- and test-dataset

Randomly choose indicies that should serve a test data and which are removed from the training data. Maybe write a function that chooses a certain % as test dxata


In [ ]:
# Randomly shuffle the index of nba.
random_indices = np.random.permutation(df_numdata.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = np.int(np.floor(len(df_numdata)/3))
print(test_cutoff)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
df_test = df_numdata.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data.
df_train = df_numdata.loc[random_indices[test_cutoff:]]

Separate the target variable in its own dataframe.


In [ ]:
df_train_data = df_train.drop(['DangerLevel'], axis=1)
df_test_data = df_test.drop(['DangerLevel'], axis=1)

df_train_target = df_train.filter(['DangerLevel'], axis=1)
df_test_target = df_test.filter(['DangerLevel'], axis=1)

In [ ]:
df_train_data.to_csv('varsom_train_data.csv', index_label='index')
df_test_data.to_csv('varsom_test_data.csv', index_label='index')

df_train_target.to_csv('varsom_train_target.csv', index_label='index')
df_test_target.to_csv('varsom_test_target.csv', index_label='index')

In [ ]: