In [ ]:
%reload_ext autoreload
%autoreload 2
In [ ]:
import numpy as np
import os
import pandas as pd
import random
import scipy
from scipy.stats import zscore
# interactive
from ipywidgets.widgets import interact, IntSlider, FloatSlider
from IPython.display import display
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from multiDatenanalyse import *
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
mmPfad = '../data/Messmatrix.csv'
In [ ]:
df = load_data()
In [ ]:
min_num_walzlos = 300
df_all_prod = [extract_product(df, product_id=product_id,
                               min_num_walzlos=min_num_walzlos)
               for product_id in range(26)]
In [ ]:
feature_dict = mv_features(df_all_prod)
feature_dict
In [ ]:
# mv_features presumably returns (feature, score) pairs sorted by importance;
# keep the names of the ten top-ranked features
feature_list = list(dict(feature_dict[:10]).keys())
feature_list
In [ ]:
min_num_walzlos = 300
df_all_prod = [extract_product(df[list(df.columns[:6]) + feature_list],
                               product_id=product_id,
                               min_num_walzlos=min_num_walzlos)
               for product_id in range(26)]
In [ ]:
@interact(index=IntSlider(min=0, max=25, value=0))
def count_per_product(index):
    # df_all_prod has 26 entries, so valid slider indices are 0..25
    print("Number of rolling lots: " + str(len(pd.unique(df_all_prod[index]["Header_Walzlos"]))))
In [ ]:
product_id = 11
df_prod = df_all_prod[product_id]
print("Anzahl der Walzlose: "+str(len(pd.unique(df_prod["Header_Walzlos"]))))
In [ ]:
test_frac = 0.4
train_set, test_set = get_lda_data(df_prod, test_frac=test_frac)
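A quick sanity check on the split: get_lda_data returns 'data'/'label' dicts (used below), so the class balance of the two splits can be compared directly. A minimal sketch, assuming the 'label' entries are 1-D arrays:
In [ ]:
# compare the class distributions of the train and test split
# (sketch; assumes the 'label' entries are 1-D arrays, as used below)
for name, split in [('train', train_set), ('test', test_set)]:
    classes, counts = np.unique(split['label'], return_counts=True)
    print('{0}: {1} classes, {2} to {3} samples per class'.format(
        name, len(classes), counts.min(), counts.max()))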
In [ ]:
# z-score each feature column (note: train and test are standardized
# with their own statistics here)
train_set['data'] = zscore(train_set['data'])
test_set['data'] = zscore(test_set['data'])
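Standardizing each split with its own statistics lets the test data influence its own scaling. A common alternative, sketched here without mutating the sets above, reuses the training mean and standard deviation:
In [ ]:
# alternative to the cell above: standardize the test split with the
# *training* statistics (sketch; arrays assumed to be samples x features)
mu = train_set['data'].mean(axis=0)
sigma = train_set['data'].std(axis=0)
X_train_alt = (train_set['data'] - mu) / sigma
X_test_alt = (test_set['data'] - mu) / sigma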
In [ ]:
cov_train = np.cov(train_set['data'].T)
cov_test = np.cov(test_set['data'].T)
plt.figure(figsize=(15,10))
ax1 = plt.subplot(121)
ax1.imshow(255*(cov_train-np.min(cov_train))/(np.max(cov_train)-np.min(cov_train)), 'gray')
ax1.set_title('cov train')
ax1.set_xlabel('features')
ax1.set_ylabel('features')
ax2 = plt.subplot(122)
ax2.imshow(255*(cov_test-np.min(cov_test))/(np.max(cov_test)-np.min(cov_test)), 'gray')
ax2.set_title('cov test')
ax2.set_xlabel('features')
ax2.set_ylabel('features')
print('How similar are the test and train sets?')
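The visual comparison is hard to judge by eye. One way to quantify it (an addition, not part of the original analysis) is the entrywise correlation of the two covariance matrices and the Frobenius norm of their difference:
In [ ]:
# quantify how similar the two covariance matrices are
frob = np.linalg.norm(cov_train - cov_test)  # Frobenius norm of the difference
corr = np.corrcoef(cov_train.ravel(), cov_test.ravel())[0, 1]  # entrywise correlation
print('Frobenius norm of the difference: {0:.3f}'.format(frob))
print('Entrywise correlation: {0:.3f}'.format(corr))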
In [ ]:
# extract data and label
X_train, y_train = train_set['data'], train_set['label']
X_test, y_test = test_set['data'], test_set['label']
# number of components for the transform
n_components = 3
# LDA object
sklearn_LDA = LDA(n_components=n_components, solver='eigen')
# fit with train data
sklearn_LDA = sklearn_LDA.fit(X_train, y_train)
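For reference, the 'eigen' solver diagonalizes S_w^{-1} S_b. A minimal numpy/scipy sketch of that computation (ignoring sklearn's internal normalization, so signs and scalings of the vectors may differ):
In [ ]:
# sketch of what the 'eigen' solver computes: the eigenvectors of the
# generalized eigenproblem Sb v = lambda Sw v
import scipy.linalg
n_feat = X_train.shape[1]
Sw = np.zeros((n_feat, n_feat))  # within-class scatter
Sb = np.zeros((n_feat, n_feat))  # between-class scatter
mean_total = X_train.mean(axis=0)
for c in np.unique(y_train):
    Xc = X_train[y_train == c]
    mean_c = Xc.mean(axis=0)
    Sw += (Xc - mean_c).T @ (Xc - mean_c)
    Sb += len(Xc) * np.outer(mean_c - mean_total, mean_c - mean_total)
evals, evecs = scipy.linalg.eigh(Sb, Sw)
order = np.argsort(evals)[::-1]  # sort by decreasing eigenvalue
evals, evecs = evals[order], evecs[:, order]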
In [ ]:
plt.stem(sklearn_LDA.explained_variance_ratio_)
plt.xlabel('Eigenvalue index')
plt.ylabel('Contribution to variance')
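The cumulative ratio makes it easier to judge how much the selected components capture together:
In [ ]:
# cumulative share of the variance captured by the selected components
cum = np.cumsum(sklearn_LDA.explained_variance_ratio_)
print('First {0} components capture {1:.1f}% of the variance'.format(len(cum), 100*cum[-1]))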
In [ ]:
train_pred = sklearn_LDA.predict(X_train)
print('{0:.2f}% train accuracy'.format(100*np.mean(train_pred == y_train)))
test_pred = sklearn_LDA.predict(X_test)
print('{0:.2f}% test accuracy'.format(100*np.mean(test_pred == y_test)))
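Accuracy alone hides which rolling lots get confused with one another. A confusion matrix on the test predictions (an addition using sklearn.metrics) breaks this down per class:
In [ ]:
# per-class breakdown of the test predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, test_pred)
plt.figure(figsize=(6,6))
plt.imshow(cm, 'gray')
plt.title('Confusion matrix (test set)')
plt.xlabel('predicted class')
plt.ylabel('true class')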
In [ ]:
data = sklearn_LDA.transform(X_train)
plot_lda(data, y_train, 'First three LDA components')
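plot_lda comes from multiDatenanalyse; if it is unavailable, a minimal fallback view of the first two components looks like this:
In [ ]:
# fallback for plot_lda: scatter of the first two LDA components per class
plt.figure(figsize=(8,6))
for label in np.unique(y_train):
    plt.scatter(data[y_train == label, 0], data[y_train == label, 1],
                marker='.', alpha=0.5)
plt.xlabel('LDA_1')
plt.ylabel('LDA_2')
plt.title('First two LDA components')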
In [ ]:
%matplotlib notebook
# needed for the 3d projection on older matplotlib versions
from mpl_toolkits.mplot3d import Axes3D
X_lda = data
y = y_train
coef_ = sklearn_LDA.coef_
eigvecs = sklearn_LDA.scalings_
labels = feature_list
xs = X_lda[:, 0]
ys = X_lda[:, 1]
zs = X_lda[:, 2]
scalex = 1.0/(xs.max()- xs.min())
scaley = 1.0/(ys.max()- ys.min())
scalez = 1.0/(zs.max()- zs.min())
my_cmap = plt.cm.get_cmap('rainbow') # or any other one
# the color maps work for [0, 1]; normalize over the feature indices
min_val, max_val = 0, coef_.shape[1]
norm = matplotlib.colors.Normalize(min_val, max_val)
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
for feat in range(coef_.shape[1]):
    color_i = my_cmap(norm(feat)) # returns an rgba value
    # loadings of feature `feat` on the first three LDA components
    # (the columns of scalings_ are the eigenvectors)
    vector = np.array([0, 0, 0, eigvecs[feat, 0], eigvecs[feat, 1], eigvecs[feat, 2]])
    #vlength = np.linalg.norm((vector[3], vector[4], vector[5]))
    ax.quiver(vector[0], vector[1], vector[2], vector[3], vector[4], vector[5],
              pivot='tail', color=color_i) #length=vlength,arrow_length_ratio=0.3/vlength
    ax.text(eigvecs[feat, 0] * 1.15,
            eigvecs[feat, 1] * 1.15,
            eigvecs[feat, 2] * 1.15,
            labels[feat], color=color_i, ha='center', va='center')
# one color per class, reusing the rainbow map normalized over the classes
class_norm = matplotlib.colors.Normalize(0, len(np.unique(y)))
for class_idx, label in enumerate(np.unique(y)):
    color_i = my_cmap(class_norm(class_idx)) # returns an rgba value
    ax.scatter(X_lda[:, 0][y == label] * scalex,
               X_lda[:, 1][y == label] * scaley,
               X_lda[:, 2][y == label] * scalez, marker='*', color=color_i,
               label=label, alpha=1)
ax.set_xlim((-2,2))
ax.set_ylim((-2,2))
ax.set_zlim((-1,1))
ax.set_xlabel('LDA_1')
ax.set_ylabel('LDA_2')
ax.set_zlabel('LDA_3')
In [ ]:
eigvecs = sklearn_LDA.scalings_
plt.figure(figsize=(20,5))
# rows: features, columns: eigenvectors; bright cells mark large absolute loadings
plt.imshow(np.abs(eigvecs), 'gray')
_ = plt.axis('off')
In [ ]:
# the LDA features are exactly feature_list, so look the names up there
print('Most valuable feature in the most valuable EV: {}'.format(feature_list[np.argmax(np.abs(eigvecs[:, 0]))]))
print('Most valuable feature in the second most valuable EV: {}'.format(feature_list[np.argmax(np.abs(eigvecs[:, 1]))]))
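The same lookup, extended to a ranking of all features by their absolute loading on the first eigenvector (a small addition for readability):
In [ ]:
# rank all features by their absolute loading on the first eigenvector
ranking = np.argsort(np.abs(eigvecs[:, 0]))[::-1]
for rank, idx in enumerate(ranking):
    print('{0:2d}. {1} ({2:+.3f})'.format(rank + 1, feature_list[idx], eigvecs[idx, 0]))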
In [ ]:
plt.figure(figsize=(25,5))
for index in range(3):
    ax = plt.subplot(1, 3, index+1)
    ax.stem(eigvecs[:, index])
    ax.set_title('Eigenvector {}'.format(index))
    ax.set_xlabel('Feature index')
In [ ]: