Create graphs from varsom data using plotly



In [2]:

    
import pandas as pd
import numpy as np
import json
import logging

#import plotly.offline as py
#py.download_plotlyjs()
#py.init_notebook_mode()
import plotly.plotly as py
import plotly.graph_objs as go
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot



In [3]:



In [5]:

    
df = pd.read_csv(r'D:\Dev\APS\aps\data\varsom\norwegian_avalanche_warnings_season_17_18.csv', index_col=0, parse_dates=['valid_from', 'valid_to', 'date_valid'])
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3822 entries, 0 to 3821
Columns: 101 entries, author to valid_to
dtypes: datetime64[ns](3), float64(10), int64(46), object(42)
memory usage: 3.0+ MB



In [6]:

    
df.head(4)









    Out[6]:







  
    
      
      author
      avalanche_danger
      avalanche_problem_1_advice
      avalanche_problem_1_cause_id
      avalanche_problem_1_cause_name
      avalanche_problem_1_destructive_size_ext_id
      avalanche_problem_1_destructive_size_ext_name
      avalanche_problem_1_distribution_id
      avalanche_problem_1_distribution_name
      avalanche_problem_1_exposed_height_1
      ...
      region_id
      region_name
      region_type_id
      region_type_name
      snow_surface
      utm_east
      utm_north
      utm_zone
      valid_from
      valid_to
    
    
      index
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      Karsten@NVE
      Det er lite snø og generelt stabile forhold i ...
      Vær varsom  der skredproblemet er å finne i ko...
      15
      Dårlig binding mellom lag i fokksnøen
      2
      2 - Middels
      1
      Få bratte heng
      0
      ...
      3003
      Nordenskiöld Land
      10
      A
      Det er generelt lite snø i terrenget. Rygger e...
      520332
      8663904
      33
      2017-12-01
      2017-12-01 23:59:59
    
    
      1
      jan arild@obskorps
      Kraftig vind fra sørøst vil gi pålagring av fe...
      Vær forsiktig i områder brattere enn 30 grader...
      10
      Nedføyket svakt lag med nysnø
      2
      2 - Middels
      2
      Noen bratte heng
      400
      ...
      3007
      Vest-Finnmark
      10
      A
      Siste dagene har det kommet 25-30 cm snø utsat...
      802123
      7794717
      33
      2017-12-01
      2017-12-01 23:59:59
    
    
      2
      jan arild@obskorps
      Generelle stabile forhold, men vindøkning fra ...
      Vær forsiktig i områder brattere enn 30 grader...
      10
      Nedføyket svakt lag med nysnø
      2
      2 - Middels
      2
      Noen bratte heng
      400
      ...
      3009
      Nord-Troms
      10
      A
      Siste dagene har det kommet 20-30 cm snø utsat...
      750984
      7742562
      33
      2017-12-01
      2017-12-01 23:59:59
    
    
      3
      jan arild@obskorps
      Vindøkning fra sør vil gi pålagring av fersk f...
      Vær forsiktig i områder brattere enn 30 grader...
      10
      Nedføyket svakt lag med nysnø
      2
      2 - Middels
      2
      Noen bratte heng
      400
      ...
      3010
      Lyngen
      10
      A
      Siste dagene har det kommet 10-20 cm snø utsat...
      692056
      7719872
      33
      2017-12-01
      2017-12-01 23:59:59
    
  

4 rows × 101 columns



In [7]:

    
x = df[df.region_id==3034]['date_valid']
y = df[df.region_id==3034]['danger_level']
data = [
    go.Bar(
        x=x,
        y=y
    )
]
py.iplot(data, filename='simple-bar-chart')









    Out[7]:



In [43]:

    
#df[df.region_id==3034]["mountain_weather_temperature_max"] = np.where(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, np.nan)
#df[df.region_id==3034]["mountain_weather_temperature_max"].replace(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, 0 ,inplace=True)

x = df[df.region_id==3034]['date_valid']
y1 = df[df.region_id==3034]['mountain_weather_precip_region']
y2 = df[df.region_id==3034]['mountain_weather_temperature_max']#.replace(df[df.region_id==3034]["mountain_weather_temperature_max"] <-50, 0, inplace=True)
data = [
    go.Bar(
        x=x,
        y=y1,
        name='Precipitation'
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode = 'lines+markers',  # 'lines' / 'markers'
        name='Temperature',
        yaxis = 'y2'
    )
]

layout = go.Layout(
    title='Precipitation & Temperature',
    yaxis=dict(
        title='Precipitation (mm)'
    ),
    yaxis2=dict(
        title='Temperature (C)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='combined-line-bar-chart')









    Out[43]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [23]:

    
#aw1718.plot.bar(x='date_valid', y=['avalanche_problem_1_destructive_size_ext_id', 'avalanche_problem_2_destructive_size_ext_id','avalanche_problem_3_destructive_size_ext_id'], ax = ax)
_reg_id = 3034

x = df[df.region_id==_reg_id]['date_valid']
y1 = df[df.region_id==_reg_id]['avalanche_problem_1_destructive_size_ext_id']
y2 = df[df.region_id==_reg_id]['avalanche_problem_2_destructive_size_ext_id']
y3 = df[df.region_id==_reg_id]['avalanche_problem_3_destructive_size_ext_id']

data = [
    go.Bar(
        x=x,
        y=y1,
        name='Problem 1'
    ),
    go.Bar(
        x=x,
        y=y2,
        name='Problem 2'
    ),
    go.Bar(
        x=x,
        y=y3,
        name='Problem 3'
    )
]

layout = go.Layout(
    barmode='group',
    title='Grouped Bar with Pandas'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-bar-chart-layout')









    Out[23]:



In [30]:

    
# or more compact
df_ = df[df.region_id==3014]
data = []
for i in range(1,4):
    data.append(go.Bar(x=df_['date_valid'], y=df_[f'avalanche_problem_{i}_destructive_size_ext_id'], name=f'Problem {i}'))

layout = go.Layout(
    barmode = 'stack',
    title = f'Avalanche size by avalanche problem\nRegion: {df_.region_name.iloc[0]}'
    )

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='aval_size_per_problem')









    



C:\Anaconda3\lib\site-packages\plotly\tools.py:1424: UserWarning:

Looks like you used a newline character: '\n'.

Plotly uses a subset of HTML escape characters
to do things like newline (<br>), bold (<b></b>),
italics (<i></i>), etc. Your newline characters 
have been converted to '<br>' so they will show 
up right on your Plotly figure!







    Out[30]:



In [40]:

    
df_ = df[df.region_id==3014]
x = df_['date_valid']
y = ['Problem 1', 'Problem 2', 'Problem 3']
z = []
for i in range(1,4):
    z.append(df_[f'avalanche_problem_{i}_destructive_size_ext_id'])

trace = go.Heatmap(x=x, y=y, z=z,
                  colorscale = 'Viridis')
data=[trace]
py.iplot(data, filename='labelled-heatmap')









    Out[40]:

frost.met.no example - Get a time series



In [44]:

    
import dateutil.parser as dp
import requests



In [52]:

    
#!/usr/bin/python

"""

This program shows how to retrieve a time series of observations from the following
combination of source, element and time range:

source:     SN18700
element:    mean(wind_speed P1D)
time range: 2010-04-01 .. 2010-05-31

The time series is written to standard output as lines of the form:

  <observation time as date/time in ISO 8601 format> \
  <observation time as seconds since 1970-01-01T00:00:00> \
  <observed value>

Save the program to a file example.py, make it executable (chmod 755 example.py),
and run it e.g. like this:

  $ CLIENTID=8e6378f7-b3-ae4fe-683f-0db1eb31b24ec ./example.py

(Note: the client ID used in the example should be replaced with a real one)

The program has been tested on the following platforms:
  - Python 2.7.3 on Ubuntu 12.04 Precise
  - Python 2.7.12 and 3.5.2 on Ubuntu 16.04 Xenial

"""
client_id = '513df91a-2f62-4445-b4ba-0b3a1c4d16bb'

# issue an HTTP GET request
r = requests.get(
    'https://frost.met.no/observations/v0.jsonld',
    {'sources': 'SN18700', 'elements': 'mean(wind_speed P1D)', 'referencetime': '2018-12-01/2018-12-15'},
    auth=(client_id, '')
)

# extract the time series from the response
if r.status_code == 200:
    #print(r.json()['data'])
    for item in r.json()['data']:
        iso8601 = item['referenceTime']
        #print(iso8601)
        #secsSince1970 = dp.parse(iso8601).strftime('%s')
        print('{} {}\n'.format(iso8601, item['observations'][0]['value']))
else:
    print('error:\n')
    print('\tstatus code: {}\n'.format(r.status_code))
    if 'error' in r.json():
        assert(r.json()['error']['code'] == r.status_code)
        print('\tmessage: {}\n'.format(r.json()['error']['message']))
        print('\treason: {}\n'.format(r.json()['error']['reason']))
    else:
        print('\tother error\n')









    



2018-12-01T00:00:00.000Z 3.2

2018-12-02T00:00:00.000Z 1.8

2018-12-03T00:00:00.000Z 2.3

2018-12-04T00:00:00.000Z 3.5

2018-12-05T00:00:00.000Z 1.7

2018-12-06T00:00:00.000Z 2.6

2018-12-07T00:00:00.000Z 2.9

2018-12-08T00:00:00.000Z 2

2018-12-09T00:00:00.000Z 0.9

2018-12-10T00:00:00.000Z 3.8

2018-12-11T00:00:00.000Z 3.9

2018-12-12T00:00:00.000Z 1.7

2018-12-13T00:00:00.000Z 2.8

2018-12-14T00:00:00.000Z 2.2



In [ ]:



In [ ]:

    
# Nordvestlandet: Trollheimen, Romsdal, Sunnmøre
warnings, url = gf.get_warnings_as_json([3022, 3023, 3024], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)

# Østlandet: Jotunheimen, Hallingdal, Vest-Telemark
#warnings, url = gf.get_warnings_as_json([3028, 3032, 3035], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)


### Use this small data extraction for testing
#warnings, url = gf.get_warnings_as_json([3022], "2018-01-01", "2018-01-15", lang_key=2, simple=False, recursive_count=5)

print(url, '\n\n', type(warnings), len(warnings))



In [ ]:

    
pprint(warnings)



In [ ]:

    
# since get_warnings_as_json returns a list, we have to apply "flatten" to each item... 
warnings_flattened = (flatten(w, root_keys_to_ignore={'CountyList', 'MunicipalityList'}) for w in warnings)
# TODO: avalanche problems are not labeled correctly by their priority - need to correct.



In [ ]:

    
df = pd.DataFrame(warnings_flattened)
df.head(5)



In [ ]:

    
df.info()



In [ ]:

    
df.describe()



In [ ]:

    
df.drop_duplicates(keep='first', inplace=True)



In [ ]:

    
# save the current dataset asa csv
df.to_csv('forecasts_raw.csv', index_label='index')

Start pre-processing



In [ ]:

    
# create necessary columns and populate them
error_count = 0
log_file = r'./log/mountain_weather.log'
logging.basicConfig(filename=log_file, level=logging.DEBUG)
for index, row in df.iterrows():
    for i in range(5):
        for j in range(4):
            try:
                col_name = '{MWType} {MWSubType}'.format(MWType=row['MountainWeather_MeasurementTypes_{0}_Name'.format(i)], MWSubType=row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)])
                col_name = col_name.replace(' ', '_')
                if col_name in df.columns.values:
                    df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
                else:
                    df[col_name] = np.nan
                    print('Created column: ', col_name)
                    df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
                    
                #print('{MWType} {MWSubType} = {MWSubTypeValue}'.format(MWType=df.loc[index]['MountainWeather_MeasurementTypes_{0}_Name'.format(i)],
                 #                                                      MWSubType=df.loc[index]['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)],
                  #                                                     MWSubTypeValue=df.loc[index]['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]))
            except KeyError:
                error_count += 1
                logging.info('MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}... does not exist - moving on.'.format(i, j))
print(f"(Encountered {error_count} KeyError(s) - see log in {log_file})")



In [ ]:

    
# TODO: remove all MountainWeather_Measurement... columns
# df.columns.values



In [ ]:



In [ ]:



In [ ]:

    
df[['ValidFrom',
    'Temperature_Max',
    'Temperature_Min',
    'Wind_Direction',
    'Wind_Speed']].describe()

Encoding categorial attributes



In [ ]:

    
# load json-file with the wanted encoding
with open(r'../config/snoskred_keys.json') as jdata:
    snoskred_keys = json.load(jdata)

# print content
pprint(snoskred_keys)



In [ ]:

    
enc_df = df.copy()



In [ ]:

    
#print(df['AvalancheProblems_0_AvalPropagationId'].unique(), type(df['AvalancheProblems_0_AvalPropagationId'].unique()[0]))
print(enc_df['Wind_Direction'].unique())



In [ ]:

    
enc_df['Wind_Direction'].replace({None: 'Not given'}, inplace=True)



In [ ]:

    
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
wind_encoded = encoder.fit_transform(enc_df["Wind_Direction"])

for id_, class_ in enumerate(encoder.classes_):
    print(f"class id {id_} has label {class_}")

print()
print(f"Encoded wind direction values for first 10 entries: {wind_encoded[:10]}")



In [ ]:

    
# change wind speeds to numerical values
df['Wind_Speed_Num'] = df['Wind_Speed'].apply(lambda i: snoskred_keys['beaufort_scale_en'][i])
df['Wind_Direction_Num'] = df['Wind_Direction'].apply(lambda i: 0 if i == None else snoskred_keys['wind_dir_conv_en'][i])

# Re-group AvalancheProblemType
# AvalancheProblemType grouped by PWL, wet slab, wet loose, dry loose, storm slab, and wind slab (and glide avalanche).
df['AvalancheProblems_0_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_0_AvalancheProblemTypeId'].apply(lambda i: 0 if i == np.nan else np.int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_1_AvalancheProblemTypeId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))

# Distribution is labeled _Propagation_ in the API and has five classes. Change name to _AvalDistribution_ and merge the uper three classes into one called _widespread_.
df['AvalancheProblems_0_Class_AvalDistribution_Num'] = df['AvalancheProblems_0_AvalPropagationId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalDistribution_Num'] = df['AvalancheProblems_1_AvalPropagationId'].apply(lambda i: 0 if str(i) == str(np.nan) else np.int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))



In [ ]:

    
# Keep only columns that hold numerical values.
# AvalCause, AvalancheExt and AvalancheProblemType are directly correlated - keep only re-grouped ..._Class_AvalancheProblemType.
df_numdata = df.filter(['AvalancheProblems_0_AvalProbabilityId',
                        'AvalancheProblems_0_Class_AvalDistributionId',
                        'AvalancheProblems_0_AvalTriggerSimpleId',
                        'AvalancheProblems_0_AvalancheProblemId',
                        'AvalancheProblems_0_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_0_AvalancheTypeId',
                        'AvalancheProblems_0_DestructiveSizeExtId',
                        'AvalancheProblems_1_AvalProbabilityId',
                        'AvalancheProblems_1_Class_AvalDistributionId',
                        'AvalancheProblems_1_AvalTriggerSimpleId',
                        'AvalancheProblems_1_AvalancheProblemId',
                        'AvalancheProblems_1_Class_AvalancheProblemTypeId',
                        'AvalancheProblems_1_AvalancheTypeId',
                        'AvalancheProblems_1_DestructiveSizeExtId',
                        'DangerLevel',
                        'ValidFrom',
                        'Rainfall_Most_exposed_area',
                        'Rainfall_Average',
                        'Wind_Speed_Num',
                        'Wind_Direction_Num',
                        'Temperature_Min',
                        'Temperature_Max',
                        'Temperature_masl',
                        'Freezing_Level_masl'], axis=1)



In [ ]:

    
df_numdata.fillna(0, inplace=True)



In [ ]:

    
# Check if there are no weired values.
for col in df_numdata.drop(['ValidFrom'], axis=1).columns.values:
    print(col, ': ', df_numdata[col].unique())



In [ ]:

    
df_numdata.hist(bins=50, figsize=(25, 20))
plt.show()



In [ ]:

    
# Remove all columns without data
#df_numdata = df_numdata.dropna(axis=1)

df_numdata.to_csv('varsom_numdata.csv', index_label='index')

Split data into a training- and test-dataset

Randomly choose indicies that should serve a test data and which are removed from the training data. Maybe write a function that chooses a certain % as test dxata



In [ ]:

    
# Randomly shuffle the index of nba.
random_indices = np.random.permutation(df_numdata.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = np.int(np.floor(len(df_numdata)/3))
print(test_cutoff)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
df_test = df_numdata.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data.
df_train = df_numdata.loc[random_indices[test_cutoff:]]

Separate the target variable in its own dataframe.



In [ ]:

    
df_train_data = df_train.drop(['DangerLevel'], axis=1)
df_test_data = df_test.drop(['DangerLevel'], axis=1)

df_train_target = df_train.filter(['DangerLevel'], axis=1)
df_test_target = df_test.filter(['DangerLevel'], axis=1)



In [ ]:

    
df_train_data.to_csv('varsom_train_data.csv', index_label='index')
df_test_data.to_csv('varsom_test_data.csv', index_label='index')

df_train_target.to_csv('varsom_train_target.csv', index_label='index')
df_test_target.to_csv('varsom_test_target.csv', index_label='index')



In [ ]:

	author	avalanche_danger	avalanche_problem_1_advice	avalanche_problem_1_cause_id	avalanche_problem_1_cause_name	avalanche_problem_1_destructive_size_ext_id	avalanche_problem_1_destructive_size_ext_name	avalanche_problem_1_distribution_id	avalanche_problem_1_distribution_name	avalanche_problem_1_exposed_height_1	...	region_id	region_name	region_type_id	region_type_name	snow_surface	utm_east	utm_north	utm_zone	valid_from	valid_to
index
0	Karsten@NVE	Det er lite snø og generelt stabile forhold i ...	Vær varsom der skredproblemet er å finne i ko...	15	Dårlig binding mellom lag i fokksnøen	2	2 - Middels	1	Få bratte heng	0	...	3003	Nordenskiöld Land	10	A	Det er generelt lite snø i terrenget. Rygger e...	520332	8663904	33	2017-12-01	2017-12-01 23:59:59
1	jan arild@obskorps	Kraftig vind fra sørøst vil gi pålagring av fe...	Vær forsiktig i områder brattere enn 30 grader...	10	Nedføyket svakt lag med nysnø	2	2 - Middels	2	Noen bratte heng	400	...	3007	Vest-Finnmark	10	A	Siste dagene har det kommet 25-30 cm snø utsat...	802123	7794717	33	2017-12-01	2017-12-01 23:59:59
2	jan arild@obskorps	Generelle stabile forhold, men vindøkning fra ...	Vær forsiktig i områder brattere enn 30 grader...	10	Nedføyket svakt lag med nysnø	2	2 - Middels	2	Noen bratte heng	400	...	3009	Nord-Troms	10	A	Siste dagene har det kommet 20-30 cm snø utsat...	750984	7742562	33	2017-12-01	2017-12-01 23:59:59
3	jan arild@obskorps	Vindøkning fra sør vil gi pålagring av fersk f...	Vær forsiktig i områder brattere enn 30 grader...	10	Nedføyket svakt lag med nysnø	2	2 - Middels	2	Noen bratte heng	400	...	3010	Lyngen	10	A	Siste dagene har det kommet 10-20 cm snø utsat...	692056	7719872	33	2017-12-01	2017-12-01 23:59:59