In [3]:
# imports
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pprint as pp
Wir öffnen die Datenbank und lassen uns die Keys der einzelnen Tabellen ausgeben.
In [ ]:
# Open the raw measurement database and list the keys of the stored tables.
hdf = pd.HDFStore('../../data/raw/TestMessungen_NEU.hdf')
# keys() must be called -- printing the bound method itself (hdf.keys) would
# only show "<bound method HDFStore.keys ...>" instead of the table paths.
print(hdf.keys())
Wir laden den Frame x1_t1_trx_1_4 und betrachten seine Dimension.
In [ ]:
# Load the frame for configuration x1, run t1, sender/receiver pair 1-4
# and report its dimensions.
df_x1_t1_trx_1_4 = hdf.get('/x1/t1/trx_1_4')
n_rows, n_cols = df_x1_t1_trx_1_4.shape
print("Rows:", n_rows)
print("Columns:", n_cols)
Als Nächstes untersuchen wir exemplarisch für zwei Empfänger-Sender-Gruppen die Attributzusammensetzung.
In [ ]:
# first inspection of columns from df_x1_t1_trx_1_4
# Last expression of the cell -> rendered as a rich HTML table.
df_x1_t1_trx_1_4.head(5)
Für die Analyse der Frames definieren wir einige Hilfsfunktionen.
In [ ]:
# Little function to retrieve sender-receiver tuples from df columns
def extract_snd_rcv(df):
    """Collect the distinct (sender, receiver) pairs encoded in the column names.

    Column names of interest look like 'trx_<snd>_<rcv>_...'; the pair id is
    taken from the fixed character positions 4:7 ('1_4' etc.).
    Returns a list of (sender, receiver) string tuples.
    """
    pattern = re.compile(r"trx_[1-4]_[1-4]")
    pair_ids = set()
    for col in df.columns:
        if pattern.search(col):
            pair_ids.add(col[4:7])
    return [(pid[0], pid[-1]) for pid in pair_ids]
In [ ]:
# Sums the number of columns for each sender-receiver tuple
def get_column_counts(snd_rcv, df):
col_counts = {}
for snd,rcv in snd_rcv:
col_counts['Columns for pair {} {}:'.format(snd, rcv)] = len([i for i, word in enumerate(list(df.columns)) if word.startswith('trx_{}_{}'.format(snd, rcv))])
return col_counts
In [ ]:
# Analyze the column composition of a given measurement.
def analyse_columns(df):
df_snd_rcv = extract_snd_rcv(df)
cc = get_column_counts(df_snd_rcv, df)
for x in cc:
print(x, cc[x])
print("Sum of pair related columns: %i" % sum(cc.values()))
print()
print("Other columns are:")
for att in [col for col in df.columns if 'ifft' not in col and 'ts' not in col]:
print(att)
In [ ]:
# Analyze the values of the target column.
def analyze_target(df):
print(df['target'].unique())
print("# Unique values in target: %i" % len(df['target'].unique()))
Bestimme nun die Spaltenzusammensetzung von df_x1_t1_trx_1_4.
In [ ]:
analyse_columns(df_x1_t1_trx_1_4)
Betrachte den Inhalt der "target"-Spalte von df_x1_t1_trx_1_4.
In [ ]:
analyze_target(df_x1_t1_trx_1_4)
Als nächstes laden wir den Frame x3_t2_trx_3_1 und betrachten seine Dimension.
In [ ]:
# Load the frame for configuration x3, run t2, pair 3-1 and report its dimensions.
df_x3_t2_trx_3_1 = hdf.get('/x3/t2/trx_3_1')
rows, cols = df_x3_t2_trx_3_1.shape
print("Rows:", rows)
print("Columns:", cols)
Gefolgt von einer Analyse seiner Spaltenzusammensetzung und seiner "target"-Werte.
In [ ]:
analyse_columns(df_x3_t2_trx_3_1)
In [ ]:
analyze_target(df_x3_t2_trx_3_1)
Frage: Was stellen Sie bzgl. der „Empfänger-Nummer_Sender-Nummer“-Kombinationen fest? Sind diese gleich? Welche Ausprägungen finden Sie in der Spalte „target“?
Antwort: Wir sehen, wenn jeweils ein Paar sendet, hören die anderen beiden Sender zu und messen ihre Verbindung zu den gerade sendenden Knoten (d.h. 6 Paare in jedem Dataframe). Sendet z.B. das Paar 3 1, so misst Knoten 1 die Verbindung 1-3, Knoten 3 die Verbindung 3-1 und Knoten 2 und 4 Verbindung 2-1 und 2-3 bzw. 4-1 und 4-3. Die 10 verschiedenen Ausprägungen der Spalte "target" sind oben zu sehen.
Wir visualisieren die Rohdaten mit verschiedenen Heatmaps, um so die Integrität der Daten optisch zu validieren und Ideen für mögliche Features zu entwickeln. Hier stellen wir exemplarisch die Daten von Frame df_x1_t1_trx_1_4 dar.
In [ ]:
# Extract the 2000 raw ifft columns for pair 2->4 as a 2-D numpy array
# (rows = measurements, columns = ifft bins) for the heatmaps below.
vals = df_x1_t1_trx_1_4.loc[:,'trx_2_4_ifft_0':'trx_2_4_ifft_1999'].values
In [ ]:
# one big heatmap
# Full-resolution heatmap of all measurements (y) over the 2000 ifft bins (x)
# for pair trx_2_4; color range is fixed to [0, 1] via vmin/vmax so different
# frames stay visually comparable.
plt.figure(figsize=(14, 12))
plt.title('trx_2_4_ifft')
plt.xlabel("ifft of frequency")
plt.ylabel("measurement")
ax = sns.heatmap(vals, xticklabels=200, yticklabels=20, vmin=0, vmax=1, cmap='nipy_spectral_r')
plt.show()
Wir betrachten wie verschiedene Farbschemata unterschiedliche Merkmale unserer Rohdaten hervorheben.
In [ ]:
# compare different heatmaps
# Draw the same data with four colormaps to see which structures each scheme
# highlights. The four near-identical subplot blocks are folded into one loop.
# NOTE: the 'Vega10' colormap was removed in matplotlib >= 3.0; 'tab10' is its
# official replacement and produces the same palette.
plt.figure(1, figsize=(12, 10))
for idx, cmap in enumerate(['nipy_spectral_r', 'terrain', 'tab10', 'Wistia'], start=1):
    plt.subplot(2, 2, idx)
    plt.title('trx_2_4_ifft')
    plt.xlabel("ifft of frequency")
    plt.ylabel("measurement")
    ax = sns.heatmap(vals, xticklabels=200, yticklabels=20, vmin=0, vmax=1, cmap=cmap)
# Adjust the subplot layout, because the logit one may take more space
# than usual, due to y-tick labels like "1 - 10^{-3}"
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                    wspace=0.2)
plt.show()
In [ ]:
# Iterating over hdf data and creating interim data presentation stored in data/interim/testmessungen_interim.hdf
# Interim data representation contains aditional binary class (binary_target - encoding 0=empty and 1=not empty)
# and multi class target (multi_target - encoding 0-9 for each possible class)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
interim_path = '../../data/interim/01_testmessungen.hdf'
def binary_mapper(df):
    """Add a 'binary_target' column to df: 0 for labels starting with 'Empty', 1 otherwise."""
    def map_binary(target):
        # Occupancy flag: 'Empty...' labels are class 0, everything else class 1.
        return 0 if target.startswith('Empty') else 1
    # Use Series.map so the result keeps df's own index. The original
    # pd.Series(map(...)) built a fresh RangeIndex, which silently misaligns
    # (producing NaNs) whenever df does not have a default 0..n-1 index.
    df['binary_target'] = df['target'].map(map_binary)
def multiclass_mapper(df):
    """Append an integer-encoded 'multi_target' column using the shared LabelEncoder `le`."""
    labels = df['target']
    # fit + transform on the same column: each frame gets a 0..k-1 encoding
    # of its own distinct target labels
    le.fit(labels)
    df['multi_target'] = le.transform(labels)
# Re-label every measurement frame and persist it to the interim store.
for key in hdf.keys():
    df = hdf.get(key)
    binary_mapper(df)      # adds the 0/1 occupancy column
    multiclass_mapper(df)  # adds the integer-encoded class column
    df.to_hdf(interim_path, key)
# Close the raw store once all frames have been written.
hdf.close()
Wir überprüfen den neu beschrifteten Dataframe „/x1/t1/trx_3_1“. Wir erwarten als Ergebnis für Messung 5 zu Beginn des Experiments „Empty“ (bzw. 0) und für Messung 120 mitten im Experiment „Not Empty“ (bzw. 1).
In [ ]:
# Sanity check on the re-labelled frame: measurement 5 (start of the experiment)
# is expected to be 0 (Empty), measurement 120 (mid-experiment) 1 (Not Empty).
hdf = pd.HDFStore('../../data/interim/01_testmessungen.hdf')
df_x1_t1_trx_3_1 = hdf.get('/x1/t1/trx_3_1')
print("binary_target for measurement 5:", df_x1_t1_trx_3_1['binary_target'][5])
print("binary_target for measurement 120:", df_x1_t1_trx_3_1['binary_target'][120])
hdf.close()
Wir folgen den Schritten in Aufgabe 4 und testen einen einfachen Erkenner.
In [ ]:
from evaluation import *
from filters import *
from utility import *
from features import *
In [ ]:
# raw data to achieve target values
# Re-open the raw store (it was closed after the re-labelling step above).
hdf = pd.HDFStore('../../data/raw/TestMessungen_NEU.hdf')
In [ ]:
# generate datasets
# Build one feature dataset per experiment run t1..t3 for configuration x1,
# pairing 3_1, using the project helpers imported above (features.py etc.).
tst = ['1','2','3']
tst_ds = []
for t in tst:
    df_tst = hdf.get('/x1/t'+t+'/trx_3_1')
    # NOTE(review): `lst` is computed but never used below — candidate for removal.
    lst = df_tst.columns[df_tst.columns.str.contains('_ifft_')]
    #df_tst_cl,_ = distortion_filter(df_tst_cl)
    groups = get_trx_groups(df_tst)
    # Row features per trx group: standard deviation, mean and peak-to-peak.
    df_std = rf_grouped(df_tst, groups=groups, fn=rf_std_single, label='target')
    df_mean = rf_grouped(df_tst, groups=groups, fn=rf_mean_single)
    df_p2p = rf_grouped(df_tst, groups=groups, fn=rf_ptp_single) # added p2p feature
    df_all = pd.concat( [df_std, df_mean, df_p2p], axis=1 ) # added p2p feature
    # Column feature: rolling standard deviation over a window of 4 measurements.
    df_all = cf_std_window(df_all, window=4, label='target')
    df_tst_sum = generate_class_label_presence(df_all, state_variable='target')
    # remove index column
    df_tst_sum = df_tst_sum[df_tst_sum.columns.values[~df_tst_sum.columns.str.contains('index')].tolist()]
    print('Columns in Dataset:',t)
    print(df_tst_sum.columns)
    tst_ds.append(df_tst_sum.copy())
In [ ]:
# holdout validation
# Train on two runs, test on the held-out third (include_self=False),
# using the random-forest classifier with a fixed random_state for reproducibility.
print(hold_out_val(tst_ds, target='target', include_self=False, cl='rf', verbose=False, random_state=1))
In [ ]:
hdf.close()
Für die Konstruktion eines eigenen Erkenners führen wir die entsprechenden Preprocessing- und Mapping-Schritte ausgehend von den Rohdaten erneut durch und passen diese unseren Bedürfnissen an.
In [3]:
# Load raw data
hdf = pd.HDFStore("../../data/raw/TestMessungen_NEU.hdf")
In [ ]:
# Check available keys in hdf store
# keys() must be called -- printing the bound method (hdf.keys) would only
# show its repr instead of the list of table paths.
print(hdf.keys())
Zuerst passen wir die Groundtruth-Label an, entfernen Zeitstempel sowie Zeilenindices und speichern die resultierenden Frames ab.
In [4]:
# Interim store for the cleaned frames. Filename typo fixed
# ('02_tesmessungen' -> '02_testmessungen', matching the cell comment below).
hdf_path = "../../data/interim/02_testmessungen.hdf"
In [10]:
# Mapping groundtruth to 0-empty and 1-not empty and prepare for further preprocessing by
# removing additional timestamp columns and index column
# Storing cleaned dataframes (no index, removed _ts columns, mapped multi classes to 0-empty, 1-not empty)
# to new hdfstore to `data/interim/02_testmessungen.hdf`
# NOTE(review): `dfs` is never appended to or read below — candidate for removal.
dfs = []
for key in hdf.keys():
    df = hdf.get(key)
    #df['target'] = df['target'].map(lambda x: 0 if x.startswith("Empty") else 1)
    # drop all time stamp columns who endswith _ts
    # ('Timestamp' itself ends with 'mp', so it survives this filter and is
    # dropped explicitly below)
    cols = [c for c in df.columns if not c.lower().endswith("ts")]
    df = df[cols]
    df = df.drop('Timestamp', axis=1)
    df = df.drop('index', axis=1)
    df.to_hdf(hdf_path, key)
hdf.close()
Wir sehen, dass nur noch die 6 x 2000 Messungen für die jeweiligen Paare sowie die 'target'-Werte in den resultierenden Frames enthalten sind.
In [5]:
# Verify the cleaned store: only the 6 x 2000 ifft columns plus 'target'
# should remain in each frame.
hdf = pd.HDFStore(hdf_path)
df = hdf.get("/x1/t1/trx_1_2")
df.head()
Out[5]:
In [6]:
# Step-1 repeating the previous taks 4 to get a comparable base result with the now dropped _ts and index column to improve from
# generate datasets
from evaluation import *
from filters import *
from utility import *
from features import *
def prepare_features(c, p):
    """Build the per-run feature datasets for configuration `c` and pairing `p`.

    For each experiment run t1..t3 the raw frame is distortion-filtered,
    row features (std, mean, peak-to-peak, kurtosis) are computed per trx
    group, followed by column features (rolling std, first difference) and
    presence class labels. Relies on the module-level `hdf` store and the
    star-imported feature helpers.

    Returns a list with one feature DataFrame per run.
    """
    tst = ['1', '2', '3']
    tst_ds = []
    for t in tst:
        df_tst = hdf.get('/x' + c + '/t' + t + '/trx_' + p)
        # (removed: unused `lst` column selection — it was dead code)
        df_tst, _ = distortion_filter(df_tst)
        groups = get_trx_groups(df_tst)
        # row features per trx group
        df_std = rf_grouped(df_tst, groups=groups, fn=rf_std_single, label='target')
        df_mean = rf_grouped(df_tst, groups=groups, fn=rf_mean_single)
        df_p2p = rf_grouped(df_tst, groups=groups, fn=rf_ptp_single)  # added p2p feature
        df_kurt = rf_grouped(df_tst, groups=groups, fn=rf_kurtosis_single)
        df_all = pd.concat([df_std, df_mean, df_p2p, df_kurt], axis=1)
        # column features: rolling std over 4 measurements, then first difference
        df_all = cf_std_window(df_all, window=4, label='target')
        df_all = cf_diff(df_all, label='target')
        df_tst_sum = generate_class_label_presence(df_all, state_variable='target')
        # remove index column
        df_tst_sum = df_tst_sum[df_tst_sum.columns.values[~df_tst_sum.columns.str.contains('index')].tolist()]
        tst_ds.append(df_tst_sum.copy())
    return tst_ds

tst_ds = prepare_features(c='1', p='3_1')
In [7]:
# Evaluating different supervised learning methods provided in eval.py
# added a NN evaluator but there are some problems regarding usage and hidden layers
# For the moment only kurtosis and cf_diff are added to the dataset as well as the distortion filter
# Feature selection is needed right now!
# rf=random forest, dt=decision tree, nb=naive Bayes, nn=neural net, knn=k-nearest neighbours
for elem in ['rf', 'dt', 'nb' ,'nn','knn']:
    print(elem, ":", hold_out_val(tst_ds, target='target', include_self=False, cl=elem, verbose=False, random_state=1))
In [8]:
# extra column features generated and reduced with PCA
from evaluation import *
from filters import *
from utility import *
from features import *
from new_features import *
def prepare_features_PCA_cf(c, p):
    """Like prepare_features, but adds PCA-reduced column features.

    Extra column features (rolling mean and peak-to-peak over the raw ifft
    columns) are each reduced to 10 principal components and concatenated
    with the PCA-reduced row-feature set (mean, peak-to-peak, kurtosis,
    skew per trx group). Relies on the module-level `hdf` store and the
    star-imported feature helpers.

    Returns a list with one feature DataFrame per experiment run t1..t3.
    """
    tst = ['1', '2', '3']
    tst_ds = []
    for t in tst:
        df_tst = hdf.get('/x' + c + '/t' + t + '/trx_' + p)
        # (removed: unused `lst` column selection — it was dead code)
        df_tst, _ = distortion_filter(df_tst)
        groups = get_trx_groups(df_tst)
        # PCA-reduced column features computed directly on the raw ifft columns
        df_cf_mean = reduce_dim_PCA(cf_mean_window(df_tst, window=3, column_key="ifft", label=None).fillna(0), n_comps=10)
        df_cf_ptp = reduce_dim_PCA(cf_ptp(df_tst, window=3, column_key="ifft", label=None).fillna(0), n_comps=10)
        # row features per trx group
        df_mean = rf_grouped(df_tst, groups=groups, fn=rf_mean_single, label='target')
        df_p2p = rf_grouped(df_tst, groups=groups, fn=rf_ptp_single)  # added p2p feature
        df_kurt = rf_grouped(df_tst, groups=groups, fn=rf_kurtosis_single)
        df_skew = rf_grouped(df_tst, groups=groups, fn=rf_skew_single)
        df_all = pd.concat([df_mean, df_p2p, df_kurt, df_skew], axis=1)
        df_all = cf_std_window(df_all, window=4, label='target')
        df_all = cf_diff(df_all, label='target')
        # reduce the combined row features to 10 components, then append the column features
        df_all = reduce_dim_PCA(df_all.fillna(0), n_comps=10, label='target')
        df_all = pd.concat([df_all, df_cf_mean, df_cf_ptp], axis=1)
        df_tst_sum = generate_class_label_presence(df_all, state_variable='target')
        # remove index column
        df_tst_sum = df_tst_sum[df_tst_sum.columns.values[~df_tst_sum.columns.str.contains('index')].tolist()]
        tst_ds.append(df_tst_sum.copy())
    return tst_ds

tst_ds_PCA = prepare_features_PCA_cf(c='1', p='3_1')
In [9]:
# Evaluating different supervised learning methods provided in eval.py
# We can see that the column features have increased F1 score of the classifiers
# Best score for Naive Bayes
for elem in ['rf', 'dt', 'nb' ,'nn','knn']:
    print(elem, ":", hold_out_val(tst_ds_PCA, target='target', include_self=False, cl=elem, verbose=False, random_state=1))
In [10]:
def evaluate_models(ds):
    """Run hold-out validation on `ds` for every classifier key.

    Returns a dict mapping the classifier key ('rf', 'dt', 'nb', 'nn', 'knn')
    to its hold_out_val result.
    """
    return {
        model: hold_out_val(ds, target='target', include_self=False,
                            cl=model, verbose=False, random_state=1)
        for model in ['rf', 'dt', 'nb', 'nn', 'knn']
    }
def evaluate_performance(c, p):
    """Prepare the baseline feature set for configuration `c` / pairing `p` and evaluate all models."""
    return evaluate_models(prepare_features(c, p))
def evaluate_performance_PCA_cf(c, p):
    """PCA variant: prepare the PCA-reduced feature set for `c` / `p` and evaluate all models."""
    return evaluate_models(prepare_features_PCA_cf(c, p))
In [12]:
# Evaluate the baseline feature set on every configuration/pairing combination.
config = ['1','2','3','4']
pairing = ['1_2','1_4','2_3','3_1','3_4','4_2']
# NOTE(review): `tst_ds` is reset here but not used in this cell — candidate for removal.
tst_ds = []
res_all = []
for c in config:
    print("Testing for configuration", c)
    for p in pairing:
        print("Analyse performance for pairing", p)
        res = evaluate_performance(c, p)
        res_all.append(res)
        # TODO draw graph
        for model in res:
            print(model, res[model])
In [13]:
# Aggregate: mean F1 score per model across all configuration/pairing runs.
all_keys = set().union(*(d.keys() for d in res_all))
print(all_keys)
print("results for prepare_features() function")
for key in all_keys:
    # item[key][0] is assumed to be the F1 score component of the
    # hold_out_val result — TODO confirm against its return format.
    print("mean F1 for {}: {}".format(key, sum(item[key][0] for item in res_all)/len(res_all)))
In [16]:
# Evaluate the PCA feature set on every configuration/pairing combination.
config = ['1','2','3','4']
pairing = ['1_2','1_4','2_3','3_1','3_4','4_2']
# NOTE(review): `tst_ds` is reset here but not used in this cell — candidate for removal.
tst_ds = []
res_all_PCA = []
for c in config:
    print("Testing for configuration", c)
    for p in pairing:
        print("Analyse performance for pairing", p)
        res = evaluate_performance_PCA_cf(c, p)
        res_all_PCA.append(res)
        # TODO draw graph
        for model in res:
            print(model, res[model])
In [18]:
# Aggregate: mean F1 score per model across all PCA-feature runs.
all_keys = set().union(*(d.keys() for d in res_all_PCA))
print(all_keys)
print("results for prepare_features_PCA_cf() function")
for key in all_keys:
    # item[key][0] is assumed to be the F1 score component of the
    # hold_out_val result — TODO confirm against its return format.
    print("mean F1 for {}: {}".format(key, sum(item[key][0] for item in res_all_PCA)/len(res_all_PCA)))
In [ ]:
# Persist the chosen result for the online predictor.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23 -- import the standalone joblib package, falling back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE(review): res['dt'] holds the last evaluation result of the decision
# tree, not a fitted model object — verify this is really what should be
# dumped (and the extension '.plk' looks like a typo for '.pkl').
joblib.dump(res['dt'], '../../models/solution_ueb02/model.plk')
The following command starts a flask_restful server on localhost port 5444 which answers JSON POST requests. The server is implemented in the file online.py within the ipynb folder and makes use of the final chosen model. Requests can be made as POST requests to http://localhost:5444/predict with a JSON body of the following format: { "row": "features" }. Be careful that the sent payload is valid JSON. The answer contains the predicted class: { "p_class": "predicted class" }. For now, the online predictor only predicts the class of single rows sent to it.
In [5]:
# Navigate to notebooks/solution_ueb02 and start the server
# with 'python -m online'
In [ ]:
# Nun werden zeilenweise Anfragen an die REST-API simuliert, jeder valider json request wird mit einer
# json prediction response beantwortet
In [ ]:
In [ ]: