PCA involves the following broad steps (a compact code sketch follows this list):
1. Standardize the d-dimensional dataset.
2. Construct the covariance matrix.
3. Decompose the covariance matrix into its eigenvectors and eigenvalues.
4. Select the k eigenvectors that correspond to the k largest eigenvalues, where k is the dimensionality of the new feature subspace (k ≤ d).
5. Construct a projection matrix W from the "top" k eigenvectors.
6. Transform the d-dimensional input dataset X using the projection matrix W to obtain the new k-dimensional feature subspace.
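As a quick illustration, these six steps can be condensed into a few lines of NumPy. This is only a minimal sketch; the function name pca_project and its arguments are illustrative and not part of the notebook that follows:

import numpy as np

def pca_project(X, k):
    # 1-2. Standardize the data, then build the covariance matrix of the features
    X_std = (X - X.mean(axis=0)) / X.std(axis=0)
    cov = np.cov(X_std.T)
    # 3-4. Eigendecompose (eigh, since the covariance matrix is symmetric) and keep the k largest
    eig_vals, eig_vecs = np.linalg.eigh(cov)
    top_k = np.argsort(eig_vals)[::-1][:k]
    # 5-6. Projection matrix W (d x k), then project onto the new k-dimensional subspace
    W = eig_vecs[:, top_k]
    return X_std @ W

The notebook below carries out the same steps cell by cell on the UCI Wine dataset.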
In [1]:
# Import the modules
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import zscore
In [2]:
# Read the dataset
dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
In [34]:
# Descriptive analytics
print("Shape of the dataset: ", dataset.shape)
dataset.columns = ['class', 'alcohol', 'malic_acid', 'ash', 'alcalinity_ash',
                   'magnesium', 'total_phenol', 'flavanoids', 'nonflavanoid_phenols',
                   'proanthocyanins', 'color_intensity', 'hue', 'diluted_wines',
                   'proline']
In [35]:
# Displaying the top 5 rows of the dataset
dataset.head(5)
Out[35]:
In [5]:
# Check for null values
dataset.isnull().values.sum()
Out[5]:
The 1st attribute is the class identifier (1-3); the remaining 13 attributes are the chemical measurements named in the columns above (alcohol through proline).
So we will consider these 13 attributes for PCA.
In [6]:
# Excluding first attribute
X = dataset.iloc[:, 1:].values
In [7]:
# Standardize the dataset
sc_X = StandardScaler()
X_std = sc_X.fit_transform(X)
X_std.shape
Out[7]:
In [8]:
# Display the standardized dataset
X_std[:3, :]
Out[8]:
In [9]:
# Compute the covariance matrix (transpose so that np.cov treats each feature as a variable)
cov_matrix = np.cov(X_std.transpose())
cov_matrix
Out[9]:
In [10]:
# Visualize the covariance matrix as a heatmap
plt.figure(figsize=(15, 15))
sns.heatmap(cov_matrix, annot=True, cmap="Greens")
Out[10]:
In [11]:
# Pair plot for this dataset
sns.pairplot(pd.DataFrame(X_std))
Out[11]:
In [12]:
# Eigendecomposition of the covariance matrix into eigenvalues and eigenvectors
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
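Note: since the covariance matrix is symmetric, np.linalg.eigh could be used instead of np.linalg.eig; it is numerically better suited to symmetric matrices and returns real eigenvalues in ascending order. A minimal alternative, reversing the order so the largest eigenvalue comes first:

# Illustrative variable names; the cells below continue with eig_vals / eig_vecs from np.linalg.eig
eig_vals_h, eig_vecs_h = np.linalg.eigh(cov_matrix)
eig_vals_h, eig_vecs_h = eig_vals_h[::-1], eig_vecs_h[:, ::-1]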
In [13]:
# Make a list of (|eigenvalue|, eigenvector) tuples
eigen_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# Sort the (eigenvalue, eigenvector) tuples from high to low by eigenvalue;
# sorting on the eigenvalue only avoids comparing NumPy arrays if two eigenvalues tie
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
In [14]:
eigen_pairs
Out[14]:
In [16]:
# Display the eigenvectors
print("Eigenvectors:")
pd.DataFrame(eig_vecs)
Out[16]:
In [17]:
# Display the eigenvalues
print("Eigenvalues:")
pd.DataFrame(eig_vals).transpose()
Out[17]:
In [18]:
eig_vecs.shape
Out[18]:
In [19]:
# Percentage of variance explained by each component, and the running (cumulative) total
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
In [20]:
# Plotting the individual and cumulative explained variance
plt.figure(figsize=(10 , 5))
plt.bar(range(13), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(13), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance (%)')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
In [21]:
# Select the top 7 eigenvectors from the sorted eigen_pairs
# (np.linalg.eig does not guarantee any ordering of the columns of eig_vecs)
eig_vecs_selected = [eigen_pairs[i][1] for i in range(7)]
In [22]:
# Projection matrix W (13 x 7): one column per selected eigenvector
W = np.array(eig_vecs_selected).transpose()
In [23]:
# Display the first 7 eigenvectors (the columns of W)
print("First 7 Eigenvectors:")
pd.DataFrame(W)
Out[23]:
In [24]:
W.shape
Out[24]:
In [25]:
X_std.shape
Out[25]:
In [26]:
# Project the standardized data onto the new 7-dimensional subspace
new_features = np.dot(X_std, W)
pd.DataFrame(new_features)
Out[26]:
Using PCA, we reduced the 13-dimensional dataset to a 7-dimensional subspace while retaining 89.33% of the variance of the original dataset.
We applied Principal Component Analysis to our dataset in six steps.
The same result can also be obtained with the PCA implementation from scikit-learn, e.g.:
from sklearn.decomposition import PCA as sklearnPCA
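For example, the 7-component reduction above could be reproduced as follows (a minimal sketch; it assumes the standardized matrix X_std from the earlier cells is still in scope):

sk_pca = sklearnPCA(n_components = 7)
X_pca = sk_pca.fit_transform(X_std)
print(X_pca.shape)
print(sk_pca.explained_variance_ratio_.sum())   # should be close to the 89.33% found above

Below, the same scikit-learn PCA is used with only 2 components as a preprocessing step for an SVM classifier.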
In [27]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
In [28]:
# Model score
classifier.score(X_test, y_test)
Out[28]:
In [29]:
# Explained Variance
explained_variance
Out[29]:
In [30]:
sns.heatmap(cm, annot=True)
Out[30]:
In [32]:
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('SVM (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()