In [25]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [25]:
# Load the first 2-D dataset and peek at a few rows plus the overall shape.
data1 = pd.read_csv('pca2.csv', sep=',')
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement (inclusive slice on the integer RangeIndex).
data1.loc[0:2, 'X1':'X2']
data1.loc[0:2, :]
data1.shape
Out[25]:
In [27]:
# Center the data by subtracting the per-column mean.
m = np.mean(data1, 0)
data1_centered = data1 - m
# .ix was removed from pandas; .loc performs the same label-based slice here.
data1_centered.loc[0:2, :]
Out[27]:
In [28]:
# PCA via eigendecomposition of the sample covariance matrix.
covariance1 = np.cov(data1_centered.T)
evals1, evecs1 = np.linalg.eig(covariance1)
transmat1 = evecs1.T  # rows of transmat1 are the eigenvectors (PCs)
evec1 = transmat1[0]
evec2 = transmat1[1]
# Project every centered point onto the eigenvector basis in a single
# matrix product — equivalent to the former per-row np.dot loop (which
# also relied on the removed .ix indexer), but vectorized.
data1_trans = np.dot(data1_centered.values, transmat1.T)
data1_trans[0:3, :]
Out[28]:
In [29]:
# Scatter the centered data and overlay the two eigenvector directions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_centered.loc[:, 'X1'], data1_centered.loc[:, 'X2'])
# Axes.set_color_cycle was removed in matplotlib 2.0;
# set_prop_cycle(color=...) is the documented replacement.
ax.set_prop_cycle(color=['black', 'red'])
ax.plot([0, evec1[0]], [0, evec1[1]])
ax.plot([0, evec2[0]], [0, evec2[1]])
ax.grid()
plt.show()
In [30]:
# Scatter the data in the principal-component coordinate system.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_trans[:, 0], data1_trans[:, 1])
ax.grid()
plt.show()
In [31]:
# Sensitivity check: zero out two of the 500 points and redo the PCA.
data1b_c = np.copy(data1_centered)
data1b_c[16, :] = 0
data1b_c[156, :] = 0
cov1b = np.cov(data1b_c.T)
evals1b, evecs1b = np.linalg.eig(cov1b)
transmat1b = evecs1b.T  # rows are the new eigenvectors
evec1b = transmat1b[0]
evec2b = transmat1b[1]
# Vectorized projection — replaces the per-row Python loop with one
# matrix product (identical result, no element-wise iteration).
data1b_t = np.dot(data1b_c, transmat1b.T)
In [32]:
# Overlay the original projection and the projection after zeroing two
# points, to visualize how much the PCs moved.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_trans[:, 0], data1_trans[:, 1])
ax.scatter(data1b_t[:, 0], data1b_t[:, 1])
ax.grid()
plt.show()
By removing 2 of the 500 data points, the projection onto the PCs changes by about 10 degrees. Hence the PCs themselves change significantly.
In [32]:
In [33]:
# Load the 4-D dataset (header row skipped) and show its shape with the
# first few rows.
data = np.loadtxt('pca4.csv', delimiter=',', skiprows=1)
data.shape, data[:10]
Out[33]:
In [34]:
# Plot each of the four dimensions against the sample index, coloring the
# suspected outliers red and everything else blue.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
outliers = [99, 199, 111, 211]
for i, ax in enumerate(axes.flatten()):
    point_colors = ['r' if idx in outliers else 'b' for idx in range(len(data))]
    ax.scatter(range(len(data)), data[:, i], c=point_colors)
    ax.set_ylabel('x{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-14, 14)
The data points at indices 99, 111, 199, and 211 (marked in red in the plot above) appear to be outliers and will be discarded.
In [35]:
# Drop the four outlier rows via a boolean keep-mask
# (equivalent to np.delete(data, outliers, axis=0)).
keep_mask = np.ones(len(data), dtype=bool)
keep_mask[outliers] = False
filtered_data = data[keep_mask]
filtered_data.shape
Out[35]:
In [36]:
# Subtract the per-column mean so each dimension is centered at zero.
column_means = filtered_data.mean(axis=0)
centered_data = filtered_data - column_means
centered_data[:10]
Out[36]:
In [37]:
# Sample covariance matrix; rowvar=False treats rows as observations
# (same as np.cov(centered_data.T)).
cov = np.cov(centered_data, rowvar=False)
cov.shape
Out[37]:
In [38]:
evals, evecs = np.linalg.eig(cov)
In [39]:
# Scree plot: eigenvalue magnitude per component.
fig, ax = plt.subplots()
ax.plot(evals, 'o')
ax.set_ylabel('Eigenvalue')
ax.set_xlabel('Component')
ax.set_ylim(0)
Out[39]:
In [40]:
evals
Out[40]:
The first two PCs seem sufficient to represent the data. The third and fourth PCs are negligible.
In [40]:
# Project the centered data onto the eigenvector basis
# (X.dot(evecs) is the same as (evecs.T @ X.T).T) and plot each component.
projected_data = centered_data.dot(evecs)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for i, ax in enumerate(axes.flatten()):
    ax.plot(projected_data[:, i], '.')
    ax.set_ylabel('u{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-6, 6)
In [42]:
whitened_data = evecs.T.dot(centered_data.T).T.dot(np.diag(1 / np.sqrt(evals)))
In [43]:
# Plot each whitened component against the sample index.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for i, ax in enumerate(axes.flatten()):
    ax.plot(whitened_data[:, i], '.')
    ax.set_ylabel('w{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-6, 6)
In [44]:
# Heatmap of the original covariance matrix (off-diagonals show correlation
# between dimensions).
fig, ax = plt.subplots()
image = ax.imshow(cov, cmap='Blues')
fig.colorbar(image, label='Covariance of original data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[44]:
In [45]:
# Covariance of the projected data — should be (near-)diagonal since the
# components are decorrelated by the eigenbasis.
projected_cov = np.cov(projected_data, rowvar=False)
fig, ax = plt.subplots()
image = ax.imshow(projected_cov, cmap='Blues')
fig.colorbar(image, label='Covariance of projected data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[45]:
In [46]:
# Covariance of the whitened data — should be (near-)identity: decorrelated
# components with unit variance.
whitened_cov = np.cov(whitened_data, rowvar=False)
fig, ax = plt.subplots()
image = ax.imshow(whitened_cov, cmap='Blues')
fig.colorbar(image, label='Covariance of whitened data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[46]:
In [46]:
In [46]:
In [ ]: