In [ ]:
%reload_ext autoreload
%autoreload 2

In [ ]:
import numpy as np
import os
import pandas as pd
import random
import scipy

from scipy.stats import zscore

# interactive
from ipywidgets.widgets import interact, IntSlider, FloatSlider
from IPython.display import display


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from multiDatenanalyse import *

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

mmPfad = '../data/Messmatrix.csv'

Load data


In [ ]:
df = load_data()

In [ ]:
min_num_walzlos = 300
df_all_prod = [extract_product(df, product_id=product_id,
                               min_num_walzlos=min_num_walzlos) for product_id in range(26)]

Extract most valuable features


In [ ]:
feature_dict = mv_features(df_all_prod)
feature_dict

In [ ]:
# feature_dict is a list of (feature, score) pairs; keep the ten best features
feature_list = list(dict(feature_dict[:10]).keys())
feature_list

Params

Extract data according to product


In [ ]:
min_num_walzlos = 300
df_all_prod = [extract_product(df[list(df.columns[:6]) + feature_list], product_id=product_id,
                               min_num_walzlos=min_num_walzlos) for product_id in range(26)]

In [ ]:
@interact(index=IntSlider(min=0, max=25, value=0))
def count_per_product(index):
    print("Number of Walzlose: " + str(len(pd.unique(df_all_prod[index]["Header_Walzlos"]))))

In [ ]:
product_id = 11
df_prod = df_all_prod[product_id]
print("Anzahl der Walzlose: "+str(len(pd.unique(df_prod["Header_Walzlos"]))))

Rearrange data for LDA


In [ ]:
test_frac = 0.4
train_set, test_set = get_lda_data(df_prod, test_frac=test_frac)
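
As a quick sanity check (a sketch; it assumes train_set['label'] and test_set['label'] are 1-d label arrays), compare the class distributions of the two splits:


In [ ]:
# sanity check: the class distribution should look similar in both splits
for name, subset in [('train', train_set), ('test', test_set)]:
    classes, counts = np.unique(subset['label'], return_counts=True)
    print(name, dict(zip(classes, counts)))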

Normalize data


In [ ]:
# normalize both splits with the training-set statistics (no test-set leakage)
mu, sigma = train_set['data'].mean(axis=0), train_set['data'].std(axis=0)
train_set['data'] = (train_set['data'] - mu) / sigma
test_set['data'] = (test_set['data'] - mu) / sigma
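
The same normalization can be written with sklearn's StandardScaler; this is an equivalent alternative to the cell above, not an additional step:


In [ ]:
from sklearn.preprocessing import StandardScaler

# equivalent alternative: fit the scaler on the train set only,
# then apply the same transform to both splits
scaler = StandardScaler().fit(train_set['data'])
train_set['data'] = scaler.transform(train_set['data'])
test_set['data'] = scaler.transform(test_set['data'])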

How different are the train and test sets


In [ ]:
cov_train = np.cov(train_set['data'].T)
cov_test = np.cov(test_set['data'].T)

plt.figure(figsize=(15,10))
ax1 = plt.subplot(121)
ax1.imshow((cov_train - np.min(cov_train)) / (np.max(cov_train) - np.min(cov_train)), 'gray')
ax1.set_title('cov train')
ax1.set_xlabel('features')
ax1.set_ylabel('features')

ax2 = plt.subplot(122)
ax2.imshow((cov_test - np.min(cov_test)) / (np.max(cov_test) - np.min(cov_test)), 'gray')
ax2.set_title('cov test')
ax2.set_xlabel('features')
ax2.set_ylabel('features')
print('How similar are the test and train sets?')
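
To put a number on the visual comparison, one simple option is the relative Frobenius distance between the two covariance matrices (a minimal sketch; 0 means identical):


In [ ]:
# relative Frobenius distance between the covariance matrices (0 = identical)
frob_dist = np.linalg.norm(cov_train - cov_test) / np.linalg.norm(cov_train)
print('relative Frobenius distance: {0:.3f}'.format(frob_dist))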

LDA


In [ ]:
# extract data and label
X_train, y_train = train_set['data'], train_set['label']
X_test, y_test = test_set['data'], test_set['label']

# number of components for the transform
n_components = 3

# LDA object
sklearn_LDA = LDA(n_components=n_components, solver='eigen')

# fit with train data
sklearn_LDA = sklearn_LDA.fit(X_train, y_train)
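
With few Walzlose per class and correlated features, the 'eigen' solver also supports shrinkage of the within-class covariance; a sketch of this optional variant (not used below):


In [ ]:
# optional variant: shrinkage regularizes the within-class covariance
# estimate, which can help when there are few samples per class
sklearn_LDA_shrink = LDA(n_components=n_components, solver='eigen', shrinkage='auto')
sklearn_LDA_shrink = sklearn_LDA_shrink.fit(X_train, y_train)
print('{0:.2f}% test accuracy with shrinkage'.format(
    100 * np.mean(sklearn_LDA_shrink.predict(X_test) == y_test)))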

Explained Variance Ratio


In [ ]:
plt.stem(sklearn_LDA.explained_variance_ratio_)
plt.xlabel('Eigenvalue index')
plt.ylabel('Contribution to variance')
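
The cumulative sum shows how much of the between-class variance the first k components capture:


In [ ]:
# cumulative explained variance of the LDA components
cum_var = np.cumsum(sklearn_LDA.explained_variance_ratio_)
for k in range(n_components):
    print('first {} component(s): {:.1f}% of the variance'.format(k + 1, 100 * cum_var[k]))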

How well can LDA classify?


In [ ]:
train_pred = sklearn_LDA.predict(X_train)
print('{0:.2f}% train accuracy'.format(100*np.mean(train_pred == y_train)))

test_pred = sklearn_LDA.predict(X_test)
print('{0:.2f}% test accuracy'.format(100*np.mean(test_pred == y_test)))
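
A confusion matrix shows per-class behaviour that a single accuracy number hides (a minimal sketch using sklearn.metrics):


In [ ]:
from sklearn.metrics import confusion_matrix

# rows: true labels, columns: predicted labels
cm = confusion_matrix(y_test, test_pred)
plt.imshow(cm, 'gray')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.colorbar()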

Plot LDA


In [ ]:
data = sklearn_LDA.transform(X_train)
plot_lda(data, y_train, 'First three LDA components')

In [ ]:
%matplotlib notebook
X_lda = data
y = y_train
eigvecs = sklearn_LDA.scalings_

labels = feature_list

xs = X_lda[:, 0]
ys = X_lda[:, 1]
zs = X_lda[:, 2]

scalex = 1.0 / (xs.max() - xs.min())
scaley = 1.0 / (ys.max() - ys.min())
scalez = 1.0 / (zs.max() - zs.min())

n_features = eigvecs.shape[0]
my_cmap = plt.get_cmap('rainbow')  # or any other one
norm = matplotlib.colors.Normalize(0, n_features)  # the color maps work for [0, 1]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# one arrow per feature: its loadings on the first three LDA components
# (scalings_ has one row per feature, one column per component)
for feat in range(n_features):
    color_i = my_cmap(norm(feat))  # returns an rgba value
    ax.quiver(0, 0, 0,
              eigvecs[feat, 0], eigvecs[feat, 1], eigvecs[feat, 2],
              pivot='tail', color=color_i)
    ax.text(eigvecs[feat, 0] * 1.15,
            eigvecs[feat, 1] * 1.15,
            eigvecs[feat, 2] * 1.15,
            labels[feat], color=color_i, ha='center', va='center')

# scatter the rescaled samples, colored by class label
label_norm = matplotlib.colors.Normalize(0, len(np.unique(y)))
for color, label in enumerate(np.unique(y)):
    color_i = my_cmap(label_norm(color))
    ax.scatter(xs[y == label] * scalex,
               ys[y == label] * scaley,
               zs[y == label] * scalez, marker='*', color=color_i,
               label=label, alpha=1)

ax.set_xlim((-2, 2))
ax.set_ylim((-2, 2))
ax.set_zlim((-1, 1))

ax.set_xlabel('LDA_1')
ax.set_ylabel('LDA_2')
ax.set_zlabel('LDA_3')

Interpret LDA


In [ ]:
eigvecs = sklearn_LDA.scalings_
plt.figure(figsize=(20,5))
plt.imshow(np.abs(eigvecs), 'gray')
_ = plt.axis('off')

In [ ]:
# the LDA was fit on feature_list, so eigenvector rows index into feature_list
print('Most important feature in the first eigenvector: {}'.format(feature_list[np.argmax(np.abs(eigvecs[:, 0]))]))
print('Most important feature in the second eigenvector: {}'.format(feature_list[np.argmax(np.abs(eigvecs[:, 1]))]))
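
Beyond the single largest entry, the top entries per eigenvector give a fuller picture (this assumes, as above, that the eigenvector rows are ordered like feature_list):


In [ ]:
# top-3 features by absolute weight for each of the first three eigenvectors
for k in range(3):
    order = np.argsort(np.abs(eigvecs[:, k]))[::-1]
    print('Eigenvector {}: {}'.format(k, [feature_list[i] for i in order[:3]]))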

In [ ]:
plt.figure(figsize=(25,5))

for index in range(3):
    ax = plt.subplot(1,3,index+1)
    ax.stem(eigvecs[:, index])
    ax.set_title('Eigenvector {}'.format(index))
    ax.set_xlabel('Feature index')
