In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import tensorflow as tf
%matplotlib inline
base_path = '/home/emp/ownCloud/Private/Master/python/Data/BA_Thiel/'
# base_path = '/media/emp/Thiel/BA_Caro/'
feature_path = os.path.join(base_path, 'features.hdf5')
In [2]:
store = pd.HDFStore(feature_path)
features = store['features']
store.close()
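As a side note, the same load can be done in a single call with pd.read_hdf, which opens and closes the store automatically:

# One-line alternative to the HDFStore open/read/close pattern above
features = pd.read_hdf(feature_path, 'features')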
In [3]:
features.head(2)
Out[3]:
In [4]:
features.groupby(["Header_Leitguete","Header_Soll_AD","Header_Soll_WD"])["Header_Pseudonummer"].agg(["count"])
Out[4]:
In [5]:
# Create two groups, one for high and one for low eccentricity. The network is later trained on the group labels (0 or 1)
N = 8000
selection = features
g1 = selection.nlargest(N, 'STB_Ex_mitte')
g2 = selection.nsmallest(N, 'STB_Ex_mitte')
fig1, ax1 = plt.subplots(1, 1, figsize=(10, 4))
fig2, ax2 = plt.subplots(1, 1, figsize=(10, 4))
g1.hist(column='STB_Ex_mitte', bins=70, density=True, ax=ax1)
g2.hist(column='STB_Ex_mitte', bins=70, density=True, ax=ax2)
ax1.set(title='large_ex')
ax2.set(title='small_ex')
Out[5]:
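As a sanity check (not part of the original cell) one can confirm that the two groups do not overlap in eccentricity, which holds as long as the data set contains at least 2*N tubes:

# The smallest value of the high-eccentricity group should not fall below
# the largest value of the low-eccentricity group
print('large_ex range: [%.4f, %.4f]' % (g1.STB_Ex_mitte.min(), g1.STB_Ex_mitte.max()))
print('small_ex range: [%.4f, %.4f]' % (g2.STB_Ex_mitte.min(), g2.STB_Ex_mitte.max()))
assert g1.STB_Ex_mitte.min() >= g2.STB_Ex_mitte.max()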
In [6]:
g1['target'] = 0
g2['target'] = 1
g_total = pd.concat([g1, g2])
g_total.head(2)
Out[6]:
In [7]:
# Clean up the data (from Michi)
df = g_total
relNaNsCol = np.array(df.isnull().sum() / df.shape[0] * 100)
# First drop all columns whose share of NaNs exceeds a given percentage
spaltenSchranke = 15  # % of NaNs per column
keep = [i for i in np.arange(len(relNaNsCol)) if relNaNsCol[i] <= spaltenSchranke]
dfVV = df[df.columns[keep]]  # extract the kept columns
# Apply the same to the rows
zeilenSchranke = 5  # % of NaNs per row
relNaNsRow = np.array(dfVV.isnull().sum(axis=1) / dfVV.shape[1] * 100)
keep = [i for i in np.arange(len(relNaNsRow)) if relNaNsRow[i] <= zeilenSchranke]
dfVV2 = dfVV.iloc[keep]  # extract the kept rows
# Fill the remaining NaNs with the column means
dfVV2 = dfVV2.fillna(dfVV2.mean())
g_clean = dfVV2.iloc[:, :]
# Summary
print("Data after preprocessing:")
print("Number of features: " + str(dfVV2.shape[1]))
print("Number of measured tubes: " + str(dfVV2.shape[0]))
print("Number of products run: " + str(dfVV2.groupby(["Header_Leitguete", "Header_Soll_AD", "Header_Soll_WD"])["Header_Pseudonummer"].agg(["count"]).shape[0]))
print("Number of rolling lots: " + str(len(pd.unique(dfVV2["Header_Walzlos"]))))
print("\nExcerpt:")
dfVV2.head()
Out[7]:
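The same filtering can be written more compactly with pandas' dropna(thresh=...), which keeps a column/row only if it has at least thresh non-NaN entries; a sketch equivalent to the 15 %/5 % thresholds above:

# Columns with at most 15 % NaNs, then rows with at most 5 % NaNs,
# then fill the remaining gaps with the column means
tmp = g_total.dropna(axis=1, thresh=int(np.ceil(0.85 * g_total.shape[0])))
tmp = tmp.dropna(axis=0, thresh=int(np.ceil(0.95 * tmp.shape[1])))
g_clean_alt = tmp.fillna(tmp.mean())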
In [8]:
# Prepare the data for the network
X_total = stats.zscore(np.array(g_clean.iloc[:, 6:-1]), axis=0)
y_total = np.array(g_clean.target, dtype=bool)
y_total = np.array([y_total, ~y_total]).T  # One-hot encoding of the targets (easier for the network to train on)
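A small illustration of what the stack-and-transpose one-hot encoding produces (column 0 corresponds to target == 1, column 1 to target == 0):

y = np.array([True, False, True])
print(np.array([y, ~y]).T.astype(int))
# [[1 0]
#  [0 1]
#  [1 0]]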
In [9]:
# Split by rolling lots rather than by individual tubes, so that a network that merely memorizes lots shows up as poor test performance.
walzlose = np.unique(g_clean.Header_Walzlos)
np.random.shuffle(walzlose)
test_anteil = 0.4
w_test, w_train = np.hsplit(walzlose, [int(np.floor(test_anteil * len(walzlose)))])
test_pattern = np.array(g_clean.Header_Walzlos.isin(w_test))
train_pattern = ~test_pattern
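A quick assertion that the split really is disjoint at the lot level, i.e. no Walzlos contributes tubes to both sets:

# If train and test shared a rolling lot, the network could score well
# on the test set simply by memorizing that lot
assert len(np.intersect1d(w_test, w_train)) == 0
print('test tubes: %d, train tubes: %d' % (test_pattern.sum(), train_pattern.sum()))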
In [10]:
X_test, y_test, X_train, y_train = (X_total[test_pattern, :], y_total[test_pattern, :], X_total[train_pattern, :], y_total[train_pattern, :])
idx = np.random.permutation(X_test.shape[0])
teX = X_test[idx, :]
teY = y_test[idx, :]
idx = np.random.permutation(X_train.shape[0])
trX = X_train[idx, :]
trY = y_train[idx, :]
In [11]:
# Model for TensorFlow (feed-forward net with dropout, borrowed from: https://github.com/nlintz/TensorFlow-Tutorials/blob/master/04_modern_net.ipynb)
def model(X, w_h, w_h2, w_o, p_keep_input, p_keep_hidden):
    X = tf.nn.dropout(X, p_keep_input)
    h = tf.nn.relu(tf.matmul(X, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)
    h2 = tf.nn.relu(tf.matmul(h, w_h2))
    h2 = tf.nn.dropout(h2, p_keep_hidden)
    return tf.matmul(h2, w_o)

def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))
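Note that tf.nn.dropout rescales the surviving activations by 1/keep_prob during training, which is why no extra scaling is needed at test time (the keep probabilities are simply set to 1.0 below). A minimal illustration with the TF 1.x API used here:

# With keep_prob = 0.5, surviving entries are scaled to 2.0 so the
# expected activation matches the no-dropout case
x = tf.ones([1, 4])
dropped = tf.nn.dropout(x, 0.5)  # entries are either 0.0 or 2.0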
In [15]:
X = tf.placeholder("float", [None, X_total.shape[1]])  # keep the None dimension free for the batch size
Y = tf.placeholder("float", [None, 2])  # 2 outputs for 2 classes due to the one-hot encoding
neurons = 300
w_h = init_weights([X_total.shape[1], neurons])
w_h2 = init_weights([neurons, neurons])
w_o = init_weights([neurons, 2])
p_keep_input = tf.placeholder("float")
p_keep_hidden = tf.placeholder("float")
py_x = model(X, w_h, w_h2, w_o, p_keep_input, p_keep_hidden)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y))
train_op = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08).minimize(cost)
# train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = tf.argmax(py_x, 1)
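The evaluation below computes the classification rate in NumPy; equivalently, an accuracy op could be added to the graph (a sketch, not part of the original notebook):

# Fraction of examples whose predicted class matches the one-hot label
correct = tf.equal(predict_op, tf.argmax(Y, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct, "float"))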
In [16]:
batch_size = 30
with tf.Session() as sess:
    # you need to initialize all variables
    sess.run(tf.global_variables_initializer())
    for i in range(50):
        for start, end in zip(range(0, len(trX), batch_size), range(batch_size, len(trX) + 1, batch_size)):
            sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end],
                                          p_keep_input: 0.6, p_keep_hidden: 0.5})
        print('Epoch: ' + str(i) + ' classification rate: ' +
              str(np.mean(np.argmax(teY, axis=1) ==
                          sess.run(predict_op, feed_dict={X: teX,
                                                          p_keep_input: 1.0,
                                                          p_keep_hidden: 1.0}))))
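The trained weights live only inside the session above; if they are needed later, a tf.train.Saver can persist them (a sketch; the checkpoint filename is hypothetical):

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... run the training loop as above ...
    saver.save(sess, os.path.join(base_path, 'ex_net.ckpt'))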