In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint as pp
In [2]:
hdfs = pd.HDFStore("../../data/raw/henrik/TestMessungen_NEU.hdf")
In [3]:
hdfs.keys()
Out[3]:
In [4]:
df1 = hdfs.get('/x1/t1/trx_1_2')
In [5]:
df1.head(5)
Out[5]:
In [6]:
# Small helper to retrieve the sender-receiver tuples from the df columns
import re

def extract_snd_rcv(df):
    # Measurement columns look like 'trx_<snd>_<rcv>_ifft_<n>'
    regex = r"trx_[1-4]_[1-4]_ifft_[0-9]*"
    # x[4:7] slices out the '<snd>_<rcv>' part, e.g. '1_2'
    snd_rcv = {x[4:7] for x in df.columns if re.search(regex, x)}
    return [(x[0], x[-1]) for x in snd_rcv]
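For illustration, a minimal hypothetical input shows what the slicing extracts:
In [ ]:
# 'trx_1_2_ifft_0'[4:7] == '1_2', so each matching column contributes ('1', '2')
demo_df = pd.DataFrame(columns=['trx_1_2_ifft_0', 'trx_1_2_ifft_1', 'target'])
print(extract_snd_rcv(demo_df))  # expected: [('1', '2')]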
In [7]:
def get_column_counts(snd_rcv, df):
    # Count the measurement columns per sender-receiver pair
    col_counts = {}
    for snd, rcv in snd_rcv:
        prefix = 'trx_{}_{}_ifft'.format(snd, rcv)
        col_counts[prefix] = len([c for c in df.columns if c.startswith(prefix)])
    return col_counts
In [8]:
df1_snd_rcv = extract_snd_rcv(df1)
cc = get_column_counts(df1_snd_rcv, df1)
pp.pprint(cc)
print("Sum of measure columns: %i" % sum(cc.values()))
print("# of other columns: %i" % (len(df1.columns) - sum(cc.values())))
In [9]:
[col for col in df1.columns if 'ifft' not in col]
Out[9]:
In [10]:
print(df1['target'].unique())
print("# Unique values in target: %i" % len(df1['target'].unique()))
In [11]:
df2 = hdfs.get('/x1/t1/trx_1_4')
df2.head()
Out[11]:
In [12]:
df2_snd_rcv = extract_snd_rcv(df2)
cc = get_column_counts(df2_snd_rcv, df2)
pp.pprint(cc)
print("Sum of measure columns: %i" % sum(cc.values()))
print("# of other columns: %i" % (len(df2.columns) - sum(cc.values())))
In [13]:
[col for col in df2.columns if 'ifft' not in col]
Out[13]:
In [14]:
print(df2['target'].unique())
print("# Unique values in target: %i" % len(df2['target'].unique()))
In [15]:
plt.figure(figsize=(20, 15))
ax = sns.heatmap(df1.loc[:,'trx_1_2_ifft_0':'trx_1_2_ifft_1999'].values, cmap='nipy_spectral_r')
In [16]:
plt.figure(figsize=(20, 15))
ax = sns.heatmap(df2.loc[:,'trx_2_4_ifft_0':'trx_2_4_ifft_1999'].values, cmap='YlGnBu')
In [17]:
# Iterate over the hdfs data and create an interim data representation stored in data/interim/henrik/01_testmessungen.hdf
# The interim representation contains an additional binary class (binary_target - encoding 0=empty and 1=not empty)
# and a multi-class target (multi_target - encoding 0-9 for each possible class)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
interim_path = '../../data/interim/henrik/01_testmessungen.hdf'
def binary_mapper(df):
    def map_binary(target):
        # 0 = room empty, 1 = room not empty
        if target.startswith('Empty'):
            return 0
        else:
            return 1
    df['binary_target'] = pd.Series(map(map_binary, df['target']))

def multiclass_mapper(df):
    # LabelEncoder assigns an integer code (0-9) to each class label
    le.fit(df['target'])
    df['multi_target'] = le.transform(df['target'])

for key in hdfs.keys():
    df = hdfs.get(key)
    binary_mapper(df)
    multiclass_mapper(df)
    df.to_hdf(interim_path, key)
hdfs.close()
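As an optional sanity check (assuming the interim file was written as intended), the new target columns can be read back from the interim store:
In [ ]:
# Optional sanity check: reopen the interim store and inspect the added columns
interim = pd.HDFStore(interim_path)
df_chk = interim.get(interim.keys()[0])
print(df_chk['binary_target'].unique())  # expected: 0 (empty) and 1 (not empty)
print(df_chk['multi_target'].unique())   # integer codes assigned by the LabelEncoder
interim.close()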
In [18]:
from evaluation import *
from filters import *
from utility import *
from features import *
In [19]:
hdfs = pd.HDFStore('../../data/interim/henrik/01_testmessungen.hdf')
In [79]:
# generate datasets
tst = ['1', '2', '3']
tst_ds = []

for t in tst:
    df_tst = hdfs.get('/x1/t' + t + '/trx_3_1')
    lst = df_tst.columns[df_tst.columns.str.contains('_ifft_')]
    #df_tst_cl,_ = distortion_filter(df_tst_cl)
    groups = get_trx_groups(df_tst)
    df_std = rf_grouped(df_tst, groups=groups, fn=rf_std_single, label='target')
    df_mean = rf_grouped(df_tst, groups=groups, fn=rf_mean_single)
    df_p2p = rf_grouped(df_tst, groups=groups, fn=rf_ptp_single)  # added p2p feature
    df_all = pd.concat([df_std, df_mean, df_p2p], axis=1)  # added p2p feature
    df_all = cf_std_window(df_all, window=4, label='target')
    df_tst_sum = generate_class_label_presence(df_all, state_variable='target')
    # remove index column
    df_tst_sum = df_tst_sum[df_tst_sum.columns.values[~df_tst_sum.columns.str.contains('index')].tolist()]
    print('Columns in Dataset:', t)
    print(df_tst_sum.columns)
    tst_ds.append(df_tst_sum.copy())
In [80]:
# holdout validation
print(hold_out_val(tst_ds, target='target', include_self=False, cl='rf', verbose=False, random_state=1))
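hold_out_val is provided by the local evaluation module, whose implementation is not shown here. As a rough sketch under that caveat, cross-dataset hold-out validation with include_self=False presumably trains on each dataset in turn and scores only on the other datasets, roughly like this (names and classifier settings are assumptions):
In [ ]:
# Hedged sketch of what hold_out_val plausibly does for cl='rf' with
# include_self=False; the actual implementation lives in evaluation.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def hold_out_sketch(datasets, target='target', random_state=1):
    scores = []
    for i, train_df in enumerate(datasets):
        clf = RandomForestClassifier(random_state=random_state)
        clf.fit(train_df.drop(target, axis=1), train_df[target])
        for j, test_df in enumerate(datasets):
            if i == j:  # include_self=False: never test on the training set
                continue
            pred = clf.predict(test_df.drop(target, axis=1))
            scores.append(accuracy_score(test_df[target], pred))
    return sum(scores) / len(scores)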
In [22]:
hdfs.close()
In [23]:
# Load hdfs data
hdfs = pd.HDFStore("../../data/raw/henrik/TestMessungen_NEU.hdf")
In [24]:
# Check available keys in hdf5 store
hdfs.keys()
Out[24]:
In [25]:
# Step-0
# Mapping groundtruth to 0-empty and 1-not empty and prepare for further preprocessing by
# removing additional timestamp columns and index column
# Storing cleaned dataframes (no index, removed _ts columns, mapped multi classes to 0-empty, 1-not empty)
# to new hdfstore to `data/interim/henrik/02_testmessungen.hdf`
hdf_path = "../../data/interim/henrik/02_testmessungen.hdf"
dfs = []
for key in hdfs.keys():
    df = hdfs.get(key)
    #df['target'] = df['target'].map(lambda x: 0 if x.startswith("Empty") else 1)
    # drop all timestamp columns that end with _ts
    cols = [c for c in df.columns if not c.lower().endswith("ts")]
    df = df[cols]
    df = df.drop('index', axis=1)
    df.to_hdf(hdf_path, key)
hdfs.close()
In [26]:
hdfs = pd.HDFStore(hdf_path)
df = hdfs.get("/x1/t1/trx_1_2")
df.head()
Out[26]:
In [88]:
# Step-1: repeat the previous task 4 to get a comparable baseline result, now with the _ts and index columns dropped, to improve from
# generate datasets
from evaluation import *
from filters import *
from utility import *
from features import *
tst = ['1', '2', '3']
tst_ds = []

for t in tst:
    df_tst = hdfs.get('/x1/t' + t + '/trx_3_1')
    lst = df_tst.columns[df_tst.columns.str.contains('_ifft_')]
    #df_tst_cl,_ = distortion_filter(df_tst_cl)
    df_tst, _ = distortion_filter(df_tst)
    groups = get_trx_groups(df_tst)
    df_std = rf_grouped(df_tst, groups=groups, fn=rf_std_single, label='target')
    df_mean = rf_grouped(df_tst, groups=groups, fn=rf_mean_single)
    df_p2p = rf_grouped(df_tst, groups=groups, fn=rf_ptp_single)  # added p2p feature
    df_kurt = rf_grouped(df_tst, groups=groups, fn=rf_kurtosis_single)  # added kurtosis feature
    df_all = pd.concat([df_std, df_mean, df_p2p, df_kurt], axis=1)
    df_all = cf_std_window(df_all, window=4, label='target')
    df_all = cf_diff(df_all, label='target')
    df_tst_sum = generate_class_label_presence(df_all, state_variable='target')
    # remove index column
    # df_tst_sum = df_tst_sum[df_tst_sum.columns.values[~df_tst_sum.columns.str.contains('index')].tolist()]
    print('Columns in Dataset:', t)
    print(df_tst_sum.columns)
    tst_ds.append(df_tst_sum.copy())
In [89]:
print(hold_out_val(tst_ds, target='target', include_self=False, cl='dt', verbose=False, random_state=1))
In [91]:
# Evaluating the different supervised learning methods provided in eval.py
# Added an NN evaluator, but there are still some problems regarding its usage and the hidden layers
# For the moment, only kurtosis and cf_diff are added to the dataset, as well as the distortion filter
# Feature selection is needed next - a possible starting point is sketched after this cell
for elem in ['rf', 'dt', 'nb', 'nn', 'knn']:
    print(hold_out_val(tst_ds, target='target', include_self=False, cl=elem, verbose=False, random_state=1))
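As a possible starting point for the feature selection mentioned above, random-forest feature importances can be inspected (a sketch, not part of the existing pipeline; it reuses tst_ds from the previous cells):
In [ ]:
# Sketch: rank the features of the first generated dataset by RF importance
from sklearn.ensemble import RandomForestClassifier

X = tst_ds[0].drop('target', axis=1)
y = tst_ds[0]['target']
clf = RandomForestClassifier(random_state=1).fit(X, y)
importances = pd.Series(clf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))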
The following command starts a flask_restful server on localhost port 5444, which answers JSON POST requests. The server is implemented in the file online.py within the ipynb folder and makes use of the final chosen model.
Requests can be made as POST requests to http://localhost:5444/predict with a JSON body of the following format:
{ "row": "features" }
Be careful that the sent body is valid JSON. The answer contains the row and the predicted class:
{ "row": "features", "p_class": "predicted class" }
For now, the online predictor only predicts the class of a single sent row. An example client request is sketched below, after the server start.
In [1]:
%run -i './online.py'
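With the server running, a prediction can be requested from a separate process, for example (a sketch; the actual feature string depends on the chosen model):
In [ ]:
# Example client request (sketch) - run while the server from online.py is up
import requests

payload = {"row": "features"}  # replace "features" with the actual feature values
response = requests.post("http://localhost:5444/predict", json=payload)
print(response.json())  # expected: {"row": "...", "p_class": "..."}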