In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.feature_selection as FS
data = pd.read_csv("./wine_dataset.csv", delimiter=";")
data.head()
Out[28]:
The Type column is replaced with a numeric categorical code
In [36]:
data["Type"] = pd.Categorical.from_array(data["Type"]).codes
data["Type"].replace("A",0)
data["Type"].replace("B",1)
data["Type"].replace("C",2)
data.head()
Out[36]:
In [37]:
data.describe()
Out[37]:
We separate the target column from the rest of the predictor variables
In [41]:
data_y = data["Type"]
data_X = data.drop(columns="Type")
data_X.head()
Out[41]:
In [45]:
mi = FS.mutual_info_classif(data_X, data_y)
print(mi)
data_X.head(0)  # empty frame, shown only to see the column order alongside the scores
Out[45]:
In [46]:
names = data_X.columns
names
Out[46]:
In [47]:
indice = np.argsort(mi)[::-1]  # feature indices, from most to least informative
print(indice)
print(names[indice])
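The ranking above can also be used for selection directly. A minimal sketch, assuming the same data_X and data_y, that keeps the five highest-scoring features with SelectKBest (k=5 is an arbitrary choice for illustration, not a value used elsewhere in this notebook):
In [ ]:
# Illustrative sketch: keep the 5 features with the highest mutual information scores.
selector = FS.SelectKBest(score_func=FS.mutual_info_classif, k=5)
X_top = selector.fit_transform(data_X, data_y)
print(names[selector.get_support()])  # names of the selected columns
print(X_top.shape)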
In [48]:
plt.figure(figsize=(8,6))
plt.subplot(121)
# Colours/labels: A = Type 0 (green), B = Type 1 (red), C = Type 2 (blue)
plt.scatter(data[data.Type==0].Flavanoids, data[data.Type==0].Color_Intensity, color='green', label='A')
plt.scatter(data[data.Type==1].Flavanoids, data[data.Type==1].Color_Intensity, color='red', label='B')
plt.scatter(data[data.Type==2].Flavanoids, data[data.Type==2].Color_Intensity, color='blue', label='C')
plt.title('Good Predictor Variables \n Flavanoids vs Color_Intensity')
plt.xlabel('Flavanoids')
plt.ylabel('Color_Intensity')
plt.legend()
plt.subplot(122)
plt.scatter(data[data.Type==0].Ash, data[data.Type==0].Nonflavanoid_Phenols, color='green', label='A')
plt.scatter(data[data.Type==1].Ash, data[data.Type==1].Nonflavanoid_Phenols, color='red', label='B')
plt.scatter(data[data.Type==2].Ash, data[data.Type==2].Nonflavanoid_Phenols, color='blue', label='C')
plt.title('Ash vs Nonflavanoid_Phenols')
plt.xlabel('Ash')
plt.ylabel('Nonflavanoid_Phenols')
plt.legend()
plt.show()
In [49]:
chi = FS.chi2(data_X, data_y)[0]  # chi2 returns (scores, p-values); keep the scores
print(chi)
indice_chi=np.argsort(chi)[::-1]
print(indice_chi)
print(names[indice_chi])
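FS.chi2 also returns the p-value of each test as its second element; a small sketch of inspecting them (keep in mind that the chi-square test requires non-negative feature values, which holds for this dataset):
In [ ]:
# Sketch: the second array returned by chi2 contains the p-value of each feature.
chi_scores, p_values = FS.chi2(data_X, data_y)
for name, p in zip(names, p_values):
    print(name, "p-value:", p)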
In [50]:
plt.figure()
plt.scatter(data[data.Type==0].Proline, data[data.Type==0].Color_Intensity, color='green', label='A')
plt.scatter(data[data.Type==1].Proline, data[data.Type==1].Color_Intensity, color='red', label='B')
plt.scatter(data[data.Type==2].Proline, data[data.Type==2].Color_Intensity, color='blue', label='C')
plt.title('Good Predictor Variables Chi-Square \n Proline vs Color_Intensity')
plt.xlabel('Proline')
plt.ylabel('Color_Intensity')
plt.legend()
plt.show()
In [51]:
from sklearn.decomposition import PCA
In [52]:
pca = PCA()
pca.fit(data_X)
plt.plot(pca.explained_variance_)
plt.ylabel("eigenvalues")
plt.xlabel("position")
plt.show()
print ("Eigenvalues\n",pca.explained_variance_)
# Percentage of variance explained for each components
print('\nExplained variance ratio (first two components):\n %s'
% str(pca.explained_variance_ratio_))
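A common way to choose the number of components is to look at the cumulative explained variance ratio; a minimal sketch reusing the fitted pca object (the 95% threshold is only an illustrative cut-off):
In [ ]:
# Cumulative explained variance; the 95% threshold is an arbitrary illustrative cut-off.
cum_var = np.cumsum(pca.explained_variance_ratio_)
print("Components needed for 95% of the variance:", np.argmax(cum_var >= 0.95) + 1)
plt.plot(cum_var)
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance ratio")
plt.show()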
We plot the projection onto the first two principal components
In [58]:
pca = PCA(n_components=2)
X_pca = pd.DataFrame(pca.fit_transform(data_X))
pca_A = X_pca[data_y == 0]
pca_B = X_pca[data_y == 1]
pca_C = X_pca[data_y == 2]
#plot
plt.scatter(x = pca_A[0], y = pca_A[1], c="blue")
plt.scatter(x = pca_B[0], y = pca_B[1], c="turquoise")
plt.scatter(x = pca_C[0], y = pca_C[1], c="darkorange")
plt.xlabel("First Component")
plt.ylabel("Second Component")
plt.legend(["A","B","C"])
plt.show()
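The unscaled projection is dominated by the features with the largest raw variance, which is why the analysis is repeated on standardized data below; a quick sketch to inspect those variances:
In [ ]:
# Per-feature variances of the raw data; large-variance columns dominate unscaled PCA.
print(data_X.var().sort_values(ascending=False))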
In [16]:
from sklearn import preprocessing
X_scaled = preprocessing.scale(data_X)
pca = PCA()
pca.fit(X_scaled)
plt.plot(pca.explained_variance_)
plt.ylabel("eigenvalues")
plt.xlabel("position")
plt.show()
print ("Eigenvalues\n",pca.explained_variance_)
# Percentage of variance explained for each components
print('\nExplained variance ratio (first two components):\n %s'
% str(pca.explained_variance_ratio_))
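The standardization and the projection can also be chained into a single estimator; a minimal sketch using a scikit-learn Pipeline (the scaled_pca name is only for illustration and is not reused later):
In [ ]:
# Sketch: chain standardization and PCA so new data is transformed consistently.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_pca = make_pipeline(StandardScaler(), PCA(n_components=2))
print(scaled_pca.fit_transform(data_X)[:5])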
In [26]:
pca = PCA(n_components=2)
X_pca = pd.DataFrame(pca.fit_transform(X_scaled))
pca_A = X_pca[data_y == 0]
pca_B = X_pca[data_y == 1]
pca_C = X_pca[data_y == 2]
#plot
plt.scatter(x = pca_A[0], y = pca_A[1], c="blue")
plt.scatter(x = pca_B[0], y = pca_B[1], c="turquoise")
plt.scatter(x = pca_C[0], y = pca_C[1], c="darkorange")
plt.xlabel("First Component")
plt.ylabel("Second Component")
plt.legend(["A","B","C"])
plt.show()
Linear Discriminant Analysis
A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.
The model fits a Gaussian density to each class.
The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions.
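Since LDA is above all a classifier, it can also be scored directly on this data; a minimal sketch using 5-fold cross-validation (the number of folds is an arbitrary choice):
In [ ]:
# Sketch: LDA used as a classifier, evaluated with 5-fold cross-validation.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
print("Mean CV accuracy:", cross_val_score(LinearDiscriminantAnalysis(), data_X, data_y, cv=5).mean())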
In [60]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
In [66]:
lda = LDA()
lda.fit(data_X,data_y)
print("Porcentaje explicado:", lda.explained_variance_ratio_)
In [64]:
X_lda = pd.DataFrame(lda.fit_transform(data_X, data_y))
# Split into the 3 types to give each a different colour
lda_A = X_lda[data_y == 0]
lda_B = X_lda[data_y == 1]
lda_C = X_lda[data_y == 2]
#plot
plt.scatter(x = lda_A[0], y = lda_A[1], c="blue")
plt.scatter(x = lda_B[0], y = lda_B[1], c="turquoise")
plt.scatter(x = lda_C[0], y = lda_C[1], c="darkorange")
plt.title("LDA without normalization")
plt.xlabel("First LDA Component")
plt.ylabel("Second LDA Component")
plt.legend(["A","B","C"], loc="lower right")
plt.show()
In [ ]:
lda = LDA(n_components=2)
lda.fit(X_scaled,data_y)
print("Porcentaje explicado:", lda.explained_variance_ratio_)
In [ ]:
X_lda = pd.DataFrame(lda.fit_transform(X_scaled, data_y))
# Split into the 3 types to give each a different colour
lda_A = X_lda[data_y == 0]
lda_B = X_lda[data_y == 1]
lda_C = X_lda[data_y == 2]
#plot
plt.scatter(x = lda_A[0], y = lda_A[1], c="blue")
plt.scatter(x = lda_B[0], y = lda_B[1], c="turquoise")
plt.scatter(x = lda_C[0], y = lda_C[1], c="darkorange")
plt.xlabel("First LDA Component")
plt.ylabel("Second LDA Component")
plt.legend(["A","B","C"],loc="lower right")
plt.title("LDA with normalization")
plt.show()
With this we have verified that LDA is invariant to the scaling of the features: the projection is essentially the same with and without standardization.
Linear Discriminant Analysis (LDA) tries to identify attributes that account for the most variance between classes. In particular, LDA, in contrast to PCA, is a supervised method, using known class labels.
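As a final sketch of this contrast, the two 2-D projections can be compared as inputs to a simple classifier inside a cross-validated pipeline (the choice of a nearest-neighbour classifier and 5 folds is purely illustrative):
In [ ]:
# Sketch: compare 2-D PCA and 2-D LDA projections as inputs to the same classifier.
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
pca_knn = make_pipeline(PCA(n_components=2), KNeighborsClassifier())
lda_knn = make_pipeline(LDA(n_components=2), KNeighborsClassifier())
for label, model in [("PCA + kNN", pca_knn), ("LDA + kNN", lda_knn)]:
    print(label, "mean CV accuracy:", cross_val_score(model, X_scaled, data_y, cv=5).mean())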