Multivariate Data Analysis

Linear Discriminant Analysis (LDA)

Michael Araz, Daniel Hasenklever, Stefan Pede


In [ ]:
%reload_ext autoreload
%autoreload 2

In [ ]:
import numpy as np
import os
import pandas as pd
import random
import scipy

# interactive
from ipywidgets.widgets import interact, IntSlider, FloatSlider
from IPython.display import display

from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from multiDatenanalyse import *

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# path to the measurement matrix (Messmatrix) CSV
mmPfad = '../data/Messmatrix.csv'

Properties of the Messmatrix (measurement matrix)


In [ ]:
df = pd.read_csv(mmPfad)
n_products = df.groupby(["Header_Leitguete", "Header_Soll_AD", "Header_Soll_WD"])["Header_Pseudonummer"].agg(["count"]).shape[0]
print(f"Number of features: {df.shape[1]}")
print(f"Number of measured pipes: {df.shape[0]}")
print(f"Number of products run: {n_products}")
print(f"Number of rolling lots (Walzlose): {len(pd.unique(df['Header_Walzlos']))}")

Products


In [ ]:
df.groupby(["Header_Leitguete","Header_Soll_AD","Header_Soll_WD"])["Header_Pseudonummer"].agg(["count"])

Preprocessing the features


In [ ]:
dfVV2 = preprocess(df)

# output
n_products = dfVV2.groupby(["Header_Leitguete", "Header_Soll_AD", "Header_Soll_WD"])["Header_Pseudonummer"].agg(["count"]).shape[0]
print("Data after preprocessing:")
print(f"Number of features: {dfVV2.shape[1]}")
print(f"Number of measured pipes: {dfVV2.shape[0]}")
print(f"Number of products run: {n_products}")
print(f"Number of rolling lots (Walzlose): {len(pd.unique(dfVV2['Header_Walzlos']))}")
print("\nPreview:")
dfVV2.head()

In [ ]:
# remove highly correlated features (threshold 0.8); the first six header columns are kept unchanged
dfNoCor, _ = dropCorrelatedColumns((dfVV2[dfVV2.columns[6:]], dfVV2[dfVV2.columns[6:]]), 0.8)
dfVV2 = pd.concat((dfVV2[dfVV2.columns[:6]], dfNoCor), axis=1)
dfVV2.head()
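
`dropCorrelatedColumns` comes from the local `multiDatenanalyse` module, so its exact behavior is not shown here. As a rough sketch of what such a filter typically does (an assumption, not the module's actual code), one column of each highly correlated pair is dropped greedily:

In [ ]:
def drop_correlated_columns_sketch(df, threshold=0.8):
    """Illustrative only: greedily drop one column of each pair with |Pearson corr| > threshold."""
    corr = df.corr().abs()
    # keep only the upper triangle so every pair is inspected exactly once
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop), to_drop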

Consider only one product


In [ ]:
@interact(index=IntSlider(min=0, max=26, value=0))
def count_per_product(index):
    # show how many pipes each rolling lot contributes to the selected product
    groupby_list, product = get_product(dfVV2, index)
    df = dfVV2.query(" & ".join(["({} == {})".format(name, param) for name, param in zip(groupby_list, product)]))

    df_walzlos = df.groupby(["Header_Walzlos"])["Header_Pseudonummer"].agg(["count"])
    display(df_walzlos.T)

In [ ]:
# fix one product (index 3) and look at how the pipes distribute over the rolling lots
groupby_list, product = get_product(dfVV2, 3)
df = dfVV2.query(" & ".join(["({} == {})".format(name, param) for name, param in zip(groupby_list, product)]))

df_walzlos = df.groupby(["Header_Walzlos"])["Header_Pseudonummer"].agg(["count"])
b = np.asarray(df_walzlos)
plt.figure(figsize=(15,10))
_ = plt.hist(b, bins=20)

In [ ]:
# discard rolling lots with fewer than 100 pipes, then encode the remaining lots as class labels
min_num_walzlos = 100
walzlose_to_drop = df_walzlos[(df_walzlos['count'] < min_num_walzlos)].index.tolist()

for walzlos in walzlose_to_drop:
    df.drop(df[df["Header_Walzlos"] == walzlos].index, inplace=True)

label_encoder = LabelEncoder().fit(df["Header_Walzlos"])

In [ ]:
train_set, test_set = get_data(df, label_encoder)
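
`get_data` is also defined in `multiDatenanalyse` and is not shown. A minimal stand-in, assuming it takes the features from column 6 onwards, encodes `Header_Walzlos` via the fitted `label_encoder`, and returns train/test dicts with `'data'` and `'label'` keys (the actual split strategy is an assumption):

In [ ]:
from sklearn.model_selection import train_test_split

def get_data_sketch(df, label_encoder, test_size=0.25, seed=0):
    """Hypothetical stand-in for get_data; the real split strategy may differ."""
    X = df[df.columns[6:]].to_numpy(dtype=float)
    y = label_encoder.transform(df["Header_Walzlos"])
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y)
    return {'data': X_tr, 'label': y_tr}, {'data': X_te, 'label': y_te}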

Normalize data


In [ ]:
# standardize; the test set is scaled with the training statistics
train_set['data'], train_mean, train_std = zscore(train_set['data'])
test_set['data'], test_mean, test_std = zscore(test_set['data'], train_mean, train_std)
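
`zscore` likewise comes from the local module. Judging by the call signature, it standardizes column-wise and optionally reuses given statistics, which is why the test set is scaled with `train_mean` and `train_std`. A sketch under that assumption:

In [ ]:
def zscore_sketch(X, mean=None, std=None):
    """Column-wise standardization; pass precomputed statistics to reuse them."""
    if mean is None:
        mean = X.mean(axis=0)
    if std is None:
        std = X.std(axis=0)
    return (X - mean) / std, mean, std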

How different are the train and test sets?


In [ ]:
cov_train = np.cov(train_set['data'].T)
cov_test = np.cov(test_set['data'].T)

plt.figure(figsize=(15,10))
ax1 = plt.subplot(121)
ax1.imshow(255*(cov_train-np.min(cov_train))/(np.max(cov_train)-np.min(cov_train)), 'gray')
ax1.set_title('cov train')
ax1.set_xlabel('features')
ax1.set_ylabel('features')

ax2 = plt.subplot(122)
ax2.imshow(255*(cov_test-np.min(cov_test))/(np.max(cov_test)-np.min(cov_test)), 'gray')
ax2.set_title('cov test')
ax2.set_xlabel('features')
ax2.set_ylabel('features')
print('How similar are the test and train sets?')
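
The side-by-side images only allow an eyeball comparison. A simple numeric summary (a hypothetical addition, not part of the original analysis) is the relative Frobenius distance between the two covariance matrices:

In [ ]:
# 0 would mean identical covariance structure; values near 1 mean very different
rel_dist = np.linalg.norm(cov_train - cov_test) / np.linalg.norm(cov_train)
print('relative Frobenius distance between covariances: {:.4f}'.format(rel_dist))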

LDA


In [ ]:
X_train, y_train = train_set['data'][:, :], train_set['label']
X_test, y_test = test_set['data'][:, :], test_set['label']

In [ ]:
n_components = 2

# 'eigen' solves the generalized eigenproblem on the between- and within-class scatter matrices
sklearn_LDA = LDA(n_components=n_components, solver='eigen')
sklearn_LDA = sklearn_LDA.fit(X_train, y_train)
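
For intuition, here is a minimal NumPy/SciPy sketch of what the `'eigen'` solver computes: the directions that maximize between-class relative to within-class scatter, i.e. the eigenvectors of the generalized problem S_b v = lambda S_w v (sklearn additionally handles priors and optional shrinkage, which this sketch omits):

In [ ]:
from scipy import linalg

def lda_eigen_sketch(X, y):
    """Eigenvectors of Sw^{-1} Sb -- the core of LDA's 'eigen' solver (no priors/shrinkage).
    Assumes Sw is non-singular."""
    overall_mean = X.mean(axis=0)
    n_features = X.shape[1]
    Sw = np.zeros((n_features, n_features))  # within-class scatter
    Sb = np.zeros((n_features, n_features))  # between-class scatter
    for c in np.unique(y):
        Xc = X[y == c]
        mc = Xc.mean(axis=0)
        Sw += (Xc - mc).T @ (Xc - mc)
        d = (mc - overall_mean)[:, np.newaxis]
        Sb += len(Xc) * (d @ d.T)
    eigvals, eigvecs = linalg.eig(Sb, Sw)   # generalized eigenproblem Sb v = lambda Sw v
    order = np.argsort(eigvals.real)[::-1]  # sort by discriminability
    return eigvals.real[order], eigvecs.real[:, order]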

In [ ]:
# how much discriminative variance each component captures
plt.stem(sklearn_LDA.explained_variance_ratio_)

In [ ]:
train_pred = sklearn_LDA.predict(X_train)
print('{0:.6f}% train accuracy'.format(100*np.mean(train_pred == y_train)))

test_pred = sklearn_LDA.predict(X_test)
print('{0:.6f}% test accuracy'.format(100*np.mean(test_pred == y_test)))

In [ ]:
data = sklearn_LDA.transform(X_train)

In [ ]:
data.shape

In [ ]:
def plot_lda(X_lda, y, title, ax=None):
    if ax is None:
        plt.figure(figsize=(10, 5))
        ax = plt.subplot(111)

    # map class indices onto the 'rainbow' colormap (Normalize scales [0, 14] to [0, 1])
    my_cmap = plt.cm.get_cmap('rainbow')
    norm = matplotlib.colors.Normalize(vmin=0, vmax=14)

    for color, label in enumerate(np.unique(y)):
        color_i = my_cmap(norm(color))  # returns an RGBA value
        ax.scatter(X_lda[y == label, 0], X_lda[y == label, 1], marker='*',
                   color=color_i, label=label, alpha=1)

    ax.set_xlabel('LDA_1')
    ax.set_ylabel('LDA_2')
    ax.set_title(title)

    ax.legend()
    ax.grid()

In [ ]:
plot_lda(data, y_train, 'LDA projection of the training data')

In [ ]:
# scalings_ contains the projection directions found by the eigen solver (columns = eigenvectors)
eigvecs = sklearn_LDA.scalings_

In [ ]:
# weight magnitude of each feature (rows) in each discriminant direction (columns)
plt.imshow(np.abs(eigvecs), 'gray')

In [ ]:
# name of the feature at index 3 (the six header columns excluded)
df[df.columns[6:]].columns[3]

In [ ]:
eigvecs.shape

In [ ]:
# index of the feature with the largest weight in the second discriminant direction
np.argmax(np.abs(eigvecs[:, 1]))

In [ ]:
# index of the feature with the largest weight in the first discriminant direction
np.argmax(np.abs(eigvecs[:, 0]))
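
The two `argmax` cells above return bare column indices. Mapping them back to feature names makes the result readable (a small helper added for illustration, not part of the original notebook):

In [ ]:
feature_names = df[df.columns[6:]].columns
for k in range(2):
    idx = np.argmax(np.abs(eigvecs[:, k]))
    print('discriminant {}: strongest feature = {}'.format(k + 1, feature_names[idx]))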