In [2]:
import pandas as pd
import numpy as np
import json
import logging
#import plotly.offline as py
#py.download_plotlyjs()
#py.init_notebook_mode()
import plotly.plotly as py
import plotly.graph_objs as go
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [5]:
df = pd.read_csv(r'D:\Dev\APS\aps\data\varsom\norwegian_avalanche_warnings_season_17_18.csv', index_col=0, parse_dates=['valid_from', 'valid_to', 'date_valid'])
df.info()
In [6]:
df.head(4)
Out[6]:
In [7]:
x = df[df.region_id==3034]['date_valid']
y = df[df.region_id==3034]['danger_level']
data = [
    go.Bar(
        x=x,
        y=y
    )
]
py.iplot(data, filename='simple-bar-chart')
Out[7]:
In [43]:
# temperatures below -50 C are sentinel values for missing data - mask them out before plotting
x = df[df.region_id==3034]['date_valid']
y1 = df[df.region_id==3034]['mountain_weather_precip_region']
y2 = df[df.region_id==3034]['mountain_weather_temperature_max']
y2 = y2.where(y2 >= -50)  # values below -50 become NaN
data = [
    go.Bar(
        x=x,
        y=y1,
        name='Precipitation'
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines+markers',  # alternatives: 'lines' or 'markers'
        name='Temperature',
        yaxis='y2'
    )
]
layout = go.Layout(
    title='Precipitation & Temperature',
    yaxis=dict(
        title='Precipitation (mm)'
    ),
    yaxis2=dict(
        title='Temperature (C)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='combined-line-bar-chart')
Out[43]:
In [23]:
_reg_id = 3034
x = df[df.region_id==_reg_id]['date_valid']
y1 = df[df.region_id==_reg_id]['avalanche_problem_1_destructive_size_ext_id']
y2 = df[df.region_id==_reg_id]['avalanche_problem_2_destructive_size_ext_id']
y3 = df[df.region_id==_reg_id]['avalanche_problem_3_destructive_size_ext_id']
data = [
    go.Bar(
        x=x,
        y=y1,
        name='Problem 1'
    ),
    go.Bar(
        x=x,
        y=y2,
        name='Problem 2'
    ),
    go.Bar(
        x=x,
        y=y3,
        name='Problem 3'
    )
]
layout = go.Layout(
    barmode='group',
    title='Avalanche size by avalanche problem (grouped)'
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-bar-chart-layout')
Out[23]:
In [30]:
# or more compact
df_ = df[df.region_id==3014]
data = []
for i in range(1, 4):
    data.append(go.Bar(x=df_['date_valid'], y=df_[f'avalanche_problem_{i}_destructive_size_ext_id'], name=f'Problem {i}'))
layout = go.Layout(
    barmode='stack',
    title=f'Avalanche size by avalanche problem<br>Region: {df_.region_name.iloc[0]}'  # plotly titles use <br>, not \n, for line breaks
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='aval_size_per_problem')
Out[30]:
In [40]:
df_ = df[df.region_id==3014]
x = df_['date_valid']
y = ['Problem 1', 'Problem 2', 'Problem 3']
z = []
for i in range(1, 4):
    z.append(df_[f'avalanche_problem_{i}_destructive_size_ext_id'])
trace = go.Heatmap(x=x, y=y, z=z, colorscale='Viridis')
data = [trace]
py.iplot(data, filename='labelled-heatmap')
Out[40]:
In [44]:
import dateutil.parser as dp
import requests
In [52]:
#!/usr/bin/python
"""
Retrieve a time series of observations from the MET Norway Frost API for the
following combination of source, element and time range:
    source:     SN18700
    element:    mean(wind_speed P1D)
    time range: 2018-12-01 .. 2018-12-15
The time series is written to standard output as lines of the form:
<observation time as date/time in ISO 8601 format> \
<observation time as seconds since 1970-01-01T00:00:00> \
<observed value>
Adapted from the example script in the Frost API documentation. The client ID
below must be replaced with your own (register at https://frost.met.no).
"""
client_id = '<your-frost-client-id>'  # replace with a real Frost API client ID
# issue an HTTP GET request
r = requests.get(
    'https://frost.met.no/observations/v0.jsonld',
    {'sources': 'SN18700', 'elements': 'mean(wind_speed P1D)', 'referencetime': '2018-12-01/2018-12-15'},
    auth=(client_id, '')
)
# extract the time series from the response
if r.status_code == 200:
    for item in r.json()['data']:
        iso8601 = item['referenceTime']
        secs_since_1970 = int(dp.parse(iso8601).timestamp())  # portable, unlike strftime('%s')
        print('{} {} {}'.format(iso8601, secs_since_1970, item['observations'][0]['value']))
else:
    print('error:')
    print('\tstatus code: {}'.format(r.status_code))
    if 'error' in r.json():
        assert r.json()['error']['code'] == r.status_code
        print('\tmessage: {}'.format(r.json()['error']['message']))
        print('\treason: {}'.format(r.json()['error']['reason']))
    else:
        print('\tother error')
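For joining these observations with the forecast data it helps to collect them into a DataFrame; a minimal sketch based on the response structure used above (assumes the request succeeded):
In [ ]:
# collect the Frost observations into a DataFrame for further analysis
obs = [(item['referenceTime'], item['observations'][0]['value']) for item in r.json()['data']]
df_wind = pd.DataFrame(obs, columns=['referenceTime', 'mean_wind_speed'])
df_wind['referenceTime'] = pd.to_datetime(df_wind['referenceTime'])
df_wind.head()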
In [ ]:
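# NOTE: `gf` below is assumed to be the forecast API module of the varsomdata
# package, e.g. `from varsomdata import getforecastapi as gf` - it is not
# imported anywhere in this notebook.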
# Nordvestlandet: Trollheimen, Romsdal, Sunnmøre
warnings, url = gf.get_warnings_as_json([3022, 3023, 3024], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)
# Østlandet: Jotunheimen, Hallingdal, Vest-Telemark
#warnings, url = gf.get_warnings_as_json([3028, 3032, 3035], "2017-12-01", "2018-05-31", lang_key=2, simple=False, recursive_count=5)
### Use this small data extraction for testing
#warnings, url = gf.get_warnings_as_json([3022], "2018-01-01", "2018-01-15", lang_key=2, simple=False, recursive_count=5)
print(url, '\n\n', type(warnings), len(warnings))
In [ ]:
from pprint import pprint
pprint(warnings)
In [ ]:
# `flatten` is assumed to come from the flatten_json package (default separator '_')
from flatten_json import flatten
# since get_warnings_as_json returns a list, we have to apply "flatten" to each item...
warnings_flattened = (flatten(w, root_keys_to_ignore={'CountyList', 'MunicipalityList'}) for w in warnings)
# TODO: avalanche problems are not labeled correctly by their priority - need to correct.
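A minimal sketch for the TODO above, to be applied to the raw `warnings` before the flattened generator is consumed. It assumes each problem dict carries an `AvalancheProblemId` where 1 marks the main problem - the field name is an assumption and should be checked against the API response:
In [ ]:
def sort_problems_by_priority(warning):
    # reorder AvalancheProblems in place so index 0 is the highest-priority problem;
    # problems without the id sort last
    problems = warning.get('AvalancheProblems') or []
    warning['AvalancheProblems'] = sorted(problems, key=lambda p: p.get('AvalancheProblemId', 99))
    return warning

warnings = [sort_problems_by_priority(w) for w in warnings]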
In [ ]:
df = pd.DataFrame(warnings_flattened)
df.head(5)
In [ ]:
df.info()
In [ ]:
df.describe()
In [ ]:
df.drop_duplicates(keep='first', inplace=True)
In [ ]:
# save the current dataset as a csv
df.to_csv('forecasts_raw.csv', index_label='index')
In [ ]:
# create necessary columns and populate them
error_count = 0
log_file = r'./log/mountain_weather.log'
logging.basicConfig(filename=log_file, level=logging.DEBUG)
for index, row in df.iterrows():
    for i in range(5):
        for j in range(4):
            try:
                col_name = '{MWType} {MWSubType}'.format(
                    MWType=row['MountainWeather_MeasurementTypes_{0}_Name'.format(i)],
                    MWSubType=row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Name'.format(i, j)])
                col_name = col_name.replace(' ', '_')
                if col_name not in df.columns.values:
                    df[col_name] = np.nan
                    print('Created column: ', col_name)
                df.loc[index, col_name] = row['MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}_Value'.format(i, j)]
            except KeyError:
                error_count += 1
                logging.info('MountainWeather_MeasurementTypes_{0}_MeasurementSubTypes_{1}... does not exist - moving on.'.format(i, j))
print(f"(Encountered {error_count} KeyError(s) - see log in {log_file})")
In [ ]:
# TODO: remove all MountainWeather_Measurement... columns
# df.columns.values
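A possible one-liner for the TODO above, assuming all raw flattened weather columns share the `MountainWeather_` prefix (the tidy columns created earlier, e.g. `Temperature_Max`, do not):
In [ ]:
# drop the raw flattened MountainWeather_* columns now that the tidy ones are populated
df = df.drop(columns=[c for c in df.columns if c.startswith('MountainWeather_')])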
In [ ]:
df[['ValidFrom',
'Temperature_Max',
'Temperature_Min',
'Wind_Direction',
'Wind_Speed']].describe()
In [ ]:
# load the JSON key file (encoding assumed to be utf-8)
with open(r'../config/snoskred_keys.json', encoding='utf-8') as jdata:
    snoskred_keys = json.load(jdata)
# print content
pprint(snoskred_keys)
In [ ]:
enc_df = df.copy()
In [ ]:
#print(df['AvalancheProblems_0_AvalPropagationId'].unique(), type(df['AvalancheProblems_0_AvalPropagationId'].unique()[0]))
print(enc_df['Wind_Direction'].unique())
In [ ]:
enc_df['Wind_Direction'].replace({None: 'Not given'}, inplace=True)
In [ ]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
wind_encoded = encoder.fit_transform(enc_df["Wind_Direction"])
for id_, class_ in enumerate(encoder.classes_):
    print(f"class id {id_} has label {class_}")
print()
print(f"Encoded wind direction values for first 10 entries: {wind_encoded[:10]}")
In [ ]:
# change wind speeds to numerical values
df['Wind_Speed_Num'] = df['Wind_Speed'].apply(lambda i: snoskred_keys['beaufort_scale_en'][i])
df['Wind_Direction_Num'] = df['Wind_Direction'].apply(lambda i: 0 if i is None else snoskred_keys['wind_dir_conv_en'][i])
# Re-group AvalancheProblemType
# AvalancheProblemType grouped by PWL, wet slab, wet loose, dry loose, storm slab, and wind slab (and glide avalanche).
# Note: `i == np.nan` is always False, so missing values must be tested with pd.isna().
df['AvalancheProblems_0_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_0_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalancheProblemType_Num'] = df['AvalancheProblems_1_AvalancheProblemTypeId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalancheProblemTypeId'][str(int(i))]))
# Distribution is labeled _Propagation_ in the API and has five classes. Rename it to _AvalDistribution_ and merge the upper three classes into one called _widespread_.
df['AvalancheProblems_0_Class_AvalDistribution_Num'] = df['AvalancheProblems_0_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
df['AvalancheProblems_1_Class_AvalDistribution_Num'] = df['AvalancheProblems_1_AvalPropagationId'].apply(lambda i: 0 if pd.isna(i) else int(snoskred_keys['Class_AvalDistributionId'][str(int(i))]))
In [ ]:
# Keep only columns that hold numerical values.
# AvalCause, AvalancheExt and AvalancheProblemType are directly correlated - keep only the re-grouped ..._Class_..._Num columns created above.
df_numdata = df.filter(['AvalancheProblems_0_AvalProbabilityId',
                        'AvalancheProblems_0_Class_AvalDistribution_Num',
                        'AvalancheProblems_0_AvalTriggerSimpleId',
                        'AvalancheProblems_0_AvalancheProblemId',
                        'AvalancheProblems_0_Class_AvalancheProblemType_Num',
                        'AvalancheProblems_0_AvalancheTypeId',
                        'AvalancheProblems_0_DestructiveSizeExtId',
                        'AvalancheProblems_1_AvalProbabilityId',
                        'AvalancheProblems_1_Class_AvalDistribution_Num',
                        'AvalancheProblems_1_AvalTriggerSimpleId',
                        'AvalancheProblems_1_AvalancheProblemId',
                        'AvalancheProblems_1_Class_AvalancheProblemType_Num',
                        'AvalancheProblems_1_AvalancheTypeId',
                        'AvalancheProblems_1_DestructiveSizeExtId',
                        'DangerLevel',
                        'ValidFrom',
                        'Rainfall_Most_exposed_area',
                        'Rainfall_Average',
                        'Wind_Speed_Num',
                        'Wind_Direction_Num',
                        'Temperature_Min',
                        'Temperature_Max',
                        'Temperature_masl',
                        'Freezing_Level_masl'], axis=1)
In [ ]:
df_numdata.fillna(0, inplace=True)
In [ ]:
# Check that there are no weird values.
for col in df_numdata.drop(['ValidFrom'], axis=1).columns.values:
    print(col, ': ', df_numdata[col].unique())
In [ ]:
import matplotlib.pyplot as plt
df_numdata.hist(bins=50, figsize=(25, 20))
plt.show()
In [ ]:
# Remove all columns without data
#df_numdata = df_numdata.dropna(axis=1)
df_numdata.to_csv('varsom_numdata.csv', index_label='index')
In [ ]:
# Randomly shuffle the index of the dataframe.
random_indices = np.random.permutation(df_numdata.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = int(np.floor(len(df_numdata) / 3))
print(test_cutoff)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
df_test = df_numdata.loc[random_indices[:test_cutoff]]
# Generate the train set with the rest of the data.
df_train = df_numdata.loc[random_indices[test_cutoff:]]
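Equivalently, scikit-learn (already used above) can produce the same kind of split; a sketch, with random_state chosen arbitrarily for reproducibility:
In [ ]:
from sklearn.model_selection import train_test_split
# test_size=1/3 mirrors the manual cutoff above
df_train, df_test = train_test_split(df_numdata, test_size=1/3, random_state=42)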
Separate the target variable into its own dataframe.
In [ ]:
df_train_data = df_train.drop(['DangerLevel'], axis=1)
df_test_data = df_test.drop(['DangerLevel'], axis=1)
df_train_target = df_train.filter(['DangerLevel'], axis=1)
df_test_target = df_test.filter(['DangerLevel'], axis=1)
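A quick sanity check that the shapes line up:
In [ ]:
# features and targets should have matching row counts
print(df_train_data.shape, df_train_target.shape)
print(df_test_data.shape, df_test_target.shape)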
In [ ]:
df_train_data.to_csv('varsom_train_data.csv', index_label='index')
df_test_data.to_csv('varsom_test_data.csv', index_label='index')
df_train_target.to_csv('varsom_train_target.csv', index_label='index')
df_test_target.to_csv('varsom_test_target.csv', index_label='index')