In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
%matplotlib inline
In [2]:
wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header = None)
wine.head()
Out[2]:
In [3]:
names = """Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,
Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,
Color intensity,Hue,OD280/OD315 of diluted wines,Proline""".replace("\n", "").split(",")
In [4]:
wine.columns = names
wine.head()
Out[4]:
In [5]:
print("Unique classes: ", wine["Class"].unique())
Let's take a look at the class distribution.
In [6]:
wine.Class.value_counts().sort_index().plot(kind = "bar")
Out[6]:
In [7]:
X, y = wine.iloc[:, 1:].values, wine.iloc[:, 0].values
In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print("Train X dimension: %s, Test X dimension: %s" % (X_train.shape, X_test.shape))
In [9]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
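As an optional sanity check, the standardized training features should have mean close to 0 and standard deviation close to 1 (the test set only approximately so, since it reuses the training statistics).
In [ ]:
# Optional sanity check: training features should be ~0 mean and ~1 std after scaling
print("Max |mean| of scaled train features: %.4f" % np.abs(X_train_std.mean(axis = 0)).max())
print("Std range of scaled train features: %.4f - %.4f" % (X_train_std.std(axis = 0).min(), X_train_std.std(axis = 0).max()))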
In [10]:
X_train_cov = np.cov(X_train_std.T)
# np.cov treats each row of its input as a variable and each column as an observation,
# hence the transpose. The result is a d x d matrix, where d is the number of features.
print("Dim of X_train: ", X_train_std.shape)
print("Dim of X_train_cov: ", X_train_cov.shape)
In [11]:
eigen_vals, eigen_vectors = np.linalg.eig(X_train_cov)
print("Eigen values: %s" % (eigen_vals))
print("Eigen vectors, shape: ", eigen_vectors.shape)
In [12]:
# np.linalg.eig returns eigenvalues in no guaranteed order, so sort them in descending order first
sorted_eigen_vals = np.sort(eigen_vals)[::-1]
explained_variance = sorted_eigen_vals / sorted_eigen_vals.sum()
plt.bar(range(len(explained_variance)), explained_variance, label = "Explained variance ratio")
plt.step(range(len(explained_variance)), np.cumsum(explained_variance), label = "Cumulative explained variance ratio")
plt.xlim(-1, 14)
plt.ylim(0, 1.5)
plt.xlabel("Principal Components")
plt.ylabel("Explained Variance Ratio")
plt.legend(loc = "best")
Out[12]:
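A common follow-up question is how many components are worth keeping; the 95% cut-off below is purely illustrative.
In [ ]:
# Number of components needed to reach an (illustrative) 95% of the total variance
cum_var = np.cumsum(explained_variance)
print("Components for 95% of the variance:", np.argmax(cum_var >= 0.95) + 1)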
In [13]:
k = 2
# Sort eigenpairs by eigenvalue in descending order before picking the top k eigenvectors
order = np.argsort(eigen_vals)[::-1]
W = eigen_vectors[:, order[:k]]
W
Out[13]:
In [14]:
X_train_pca = X_train_std.dot(W)
X_test_pca = X_test_std.dot(W)
X_combined_pca = np.vstack((X_train_pca, X_test_pca))
y_combined = np.hstack((y_train, y_test))
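Before plotting, a quick check that the two projected components are uncorrelated, as PCA promises: their covariance matrix should be (numerically) diagonal, with the two largest eigenvalues on the diagonal.
In [ ]:
# The projected components should be uncorrelated: off-diagonal covariance entries ~ 0,
# and the diagonal should hold the two largest eigenvalues
print(np.round(np.cov(X_train_pca.T), 4))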
In [15]:
markers = ["o", "x", "v"]
colors = ["red", "blue", "green"]
for idx, cl in enumerate(np.unique(y_train)):
    X_train_filtered = X_train_pca[y_train == cl, :]
    plt.scatter(X_train_filtered[:, 0], X_train_filtered[:, 1], marker = markers[idx], color = colors[idx])
plt.xlabel("PC1")
plt.ylabel("PC2")
Out[15]:
In [16]:
lr = LogisticRegression(C = 1000, random_state = 123, penalty = "l2")
lr.fit(X_train_pca, y_train)
score = lr.score(X_test_pca, y_test)
print("Accuracy score on pca data: %.4f" % score)
plt.figure(figsize = (15, 10))
plot_decision_regions(X_combined_pca, y_combined, lr)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Decision region for the wine classification on PCA data")
plt.legend(loc = "lower left")
Out[16]:
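For comparison (an extra experiment, not required for the PCA workflow), the same classifier can be fit on all 13 standardized features to see how much accuracy the 2-component projection gives up.
In [ ]:
# Reference model on the full 13-dimensional standardized data (for comparison only)
lr_full = LogisticRegression(C = 1000, random_state = 123, penalty = "l2", max_iter = 1000)
lr_full.fit(X_train_std, y_train)
print("Accuracy score on full standardized data: %.4f" % lr_full.score(X_test_std, y_test))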
In [17]:
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_combined_pca = np.vstack((X_train_pca, X_test_pca))
y_combined = np.hstack((y_train, y_test))
lr = LogisticRegression(C = 1000, random_state = 0, penalty = "l2")
lr.fit(X_train_pca, y_train)
test_pred = lr.predict(X_test_pca)
score = accuracy_score(y_test, test_pred)
print("Accuracy score on pca data: %.4f" % score)
plt.figure(figsize = (15, 10))
plot_decision_regions(X_combined_pca, y_combined, lr)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Decision region for the wine classification on PCA hyperplane")
Out[17]:
In [18]:
X_train_pca[:10, :]
Out[18]:
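Note that scikit-learn's PCA and the manual eigendecomposition can disagree in the sign of individual components (eigenvectors are only defined up to sign), so the projected columns may be mirrored. The check below compares the two projections up to a per-column sign flip.
In [ ]:
# The manual projection and sklearn's should agree up to a sign flip per column
manual_proj = X_train_std.dot(W)
print("Match up to sign: ", np.allclose(np.abs(manual_proj), np.abs(X_train_pca)))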
In [19]:
pca = PCA(n_components = None)
pca.fit(X_train_std)
Out[19]:
In [20]:
ratios = pca.explained_variance_ratio_
plt.bar(range(len(ratios)), ratios, label = "Explained variance ratio")
plt.step(range(len(ratios)), np.cumsum(ratios), label = "Cumulative explained variance ratio")
plt.legend(loc = "best")
plt.xlabel("Principal Components")
plt.ylabel("Explained variance ratio")
Out[20]:
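scikit-learn can also choose the number of components from a variance threshold directly, by passing a float to n_components; the 0.95 here is just an illustrative cut-off.
In [ ]:
# Passing a float in (0, 1) tells PCA to keep enough components to reach that variance fraction
pca_95 = PCA(n_components = 0.95)
pca_95.fit(X_train_std)
print("Components kept for 95% of the variance:", pca_95.n_components_)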
In [21]:
kpca = KernelPCA(n_components = 2, kernel = "sigmoid", gamma = 10)
kpca.fit(X_train_std)
X_train_kpca = kpca.transform(X_train_std)
X_test_kpca = kpca.transform(X_test_std)
X_combined_kpca = np.vstack((X_train_kpca, X_test_kpca))
y_combined = np.hstack((y_train, y_test))
lr = LogisticRegression(C = 1000, random_state = 0, penalty = "l2")
lr.fit(X_train_kpca, y_train)
test_pred = lr.predict(X_test_kpca)
score = accuracy_score(y_test, test_pred)
print("Accuracy score on kpca data: %.4f" % score)
plt.figure(figsize = (15, 10))
plot_decision_regions(X_combined_kpca, y_combined, lr)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Decision region for the wine classification on PCA hyperplane")
Out[21]:
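The sigmoid kernel with gamma = 10 is only one possible setting; the small sweep below (an illustrative grid, not a tuned one) shows how sensitive the downstream accuracy is to the kernel and gamma choices.
In [ ]:
# Illustrative sweep over a few kernel / gamma settings for KernelPCA
for kernel, gamma in [("rbf", 0.1), ("rbf", 1.0), ("sigmoid", 1.0), ("sigmoid", 10)]:
    kp = KernelPCA(n_components = 2, kernel = kernel, gamma = gamma)
    X_tr = kp.fit_transform(X_train_std)
    X_te = kp.transform(X_test_std)
    clf = LogisticRegression(C = 1000, random_state = 0, penalty = "l2").fit(X_tr, y_train)
    print("kernel=%s, gamma=%s: accuracy %.4f" % (kernel, gamma, clf.score(X_te, y_test)))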
In [ ]: