In [25]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [25]:
# Load the first 2-D dataset and peek at a few rows plus the overall shape.
data1 = pd.read_csv('pca2.csv', sep=',')
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement (inclusive slice on the integer RangeIndex).
data1.loc[0:2, 'X1':'X2']
data1.loc[0:2, :]
data1.shape
Out[25]:
In [27]:
# Center the data by subtracting the per-column mean.
m = np.mean(data1, 0)
data1_centered = data1 - m
# .ix was removed from pandas; .loc performs the same label-based slice here.
data1_centered.loc[0:2, :]
Out[27]:
In [28]:
# PCA via eigendecomposition of the sample covariance matrix.
covariance1 = np.cov(data1_centered.T)
evals1, evecs1 = np.linalg.eig(covariance1)
transmat1 = evecs1.T  # rows of transmat1 are the eigenvectors (PCs)
evec1 = transmat1[0]
evec2 = transmat1[1]
# Project every centered point onto the eigenvector basis in a single
# matrix product — equivalent to the former per-row np.dot loop (which
# also relied on the removed .ix indexer), but vectorized.
data1_trans = np.dot(data1_centered.values, transmat1.T)
data1_trans[0:3, :]
Out[28]:
In [29]:
# Scatter the centered data and overlay the two eigenvector directions.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_centered.loc[:, 'X1'], data1_centered.loc[:, 'X2'])
# Axes.set_color_cycle was removed in matplotlib 2.0;
# set_prop_cycle(color=...) is the documented replacement.
ax.set_prop_cycle(color=['black', 'red'])
ax.plot([0, evec1[0]], [0, evec1[1]])
ax.plot([0, evec2[0]], [0, evec2[1]])
ax.grid()
plt.show()
In [30]:
# Scatter the data in the principal-component coordinate system.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_trans[:, 0], data1_trans[:, 1])
ax.grid()
plt.show()
In [31]:
# Sensitivity check: zero out two of the 500 points and redo the PCA.
data1b_c = np.copy(data1_centered)
data1b_c[16, :] = 0
data1b_c[156, :] = 0
cov1b = np.cov(data1b_c.T)
evals1b, evecs1b = np.linalg.eig(cov1b)
transmat1b = evecs1b.T  # rows are the new eigenvectors
evec1b = transmat1b[0]
evec2b = transmat1b[1]
# Vectorized projection — replaces the per-row Python loop with one
# matrix product (identical result, no element-wise iteration).
data1b_t = np.dot(data1b_c, transmat1b.T)
In [32]:
# Overlay the original projection and the projection after zeroing two
# points, to visualize how much the PCs moved.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(data1_trans[:, 0], data1_trans[:, 1])
ax.scatter(data1b_t[:, 0], data1b_t[:, 1])
ax.grid()
plt.show()
By removing 2 of the 500 data points, the projection onto the PCs changes by about 10 degrees. Hence the PCs themselves change significantly.
In [32]:
In [33]:
# Load the 4-D dataset (header row skipped) and show its shape with the
# first few rows.
data = np.loadtxt('pca4.csv', delimiter=',', skiprows=1)
data.shape, data[:10]
Out[33]:
In [34]:
# Plot each of the four dimensions against the sample index, coloring the
# suspected outliers red and everything else blue.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
outliers = [99, 199, 111, 211]
for i, ax in enumerate(axes.flatten()):
    point_colors = ['r' if idx in outliers else 'b' for idx in range(len(data))]
    ax.scatter(range(len(data)), data[:, i], c=point_colors)
    ax.set_ylabel('x{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-14, 14)
The data points at indices 99, 111, 199, and 211 (marked in red in the plot above) appear to be outliers and will be discarded.
In [35]:
# Drop the four outlier rows via a boolean keep-mask
# (equivalent to np.delete(data, outliers, axis=0)).
keep_mask = np.ones(len(data), dtype=bool)
keep_mask[outliers] = False
filtered_data = data[keep_mask]
filtered_data.shape
Out[35]:
In [36]:
# Subtract the per-column mean so each dimension is centered at zero.
column_means = filtered_data.mean(axis=0)
centered_data = filtered_data - column_means
centered_data[:10]
Out[36]:
In [37]:
# Sample covariance matrix; rowvar=False treats rows as observations
# (same as np.cov(centered_data.T)).
cov = np.cov(centered_data, rowvar=False)
cov.shape
Out[37]:
In [38]:
evals, evecs = np.linalg.eig(cov)
In [39]:
# Scree plot: eigenvalue magnitude per component.
fig, ax = plt.subplots()
ax.plot(evals, 'o')
ax.set_ylabel('Eigenvalue')
ax.set_xlabel('Component')
ax.set_ylim(0)
Out[39]:
In [40]:
evals
Out[40]:
The first two PCs seem sufficient to represent the data. The third and fourth PCs are negligible.
In [40]:
# Project the centered data onto the eigenvector basis
# (X.dot(evecs) is the same as (evecs.T @ X.T).T) and plot each component.
projected_data = centered_data.dot(evecs)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for i, ax in enumerate(axes.flatten()):
    ax.plot(projected_data[:, i], '.')
    ax.set_ylabel('u{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-6, 6)
In [42]:
whitened_data = evecs.T.dot(centered_data.T).T.dot(np.diag(1 / np.sqrt(evals)))
In [43]:
# Plot each whitened component against the sample index.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for i, ax in enumerate(axes.flatten()):
    ax.plot(whitened_data[:, i], '.')
    ax.set_ylabel('w{}'.format(i + 1))
    ax.set_xlabel('Index')
    ax.set_ylim(-6, 6)
In [44]:
# Heatmap of the original covariance matrix (off-diagonals show correlation
# between dimensions).
fig, ax = plt.subplots()
image = ax.imshow(cov, cmap='Blues')
fig.colorbar(image, label='Covariance of original data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[44]:
In [45]:
# Covariance of the projected data — should be (near-)diagonal since the
# components are decorrelated by the eigenbasis.
projected_cov = np.cov(projected_data, rowvar=False)
fig, ax = plt.subplots()
image = ax.imshow(projected_cov, cmap='Blues')
fig.colorbar(image, label='Covariance of projected data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[45]:
In [46]:
# Covariance of the whitened data — should be (near-)identity: decorrelated
# components with unit variance.
whitened_cov = np.cov(whitened_data, rowvar=False)
fig, ax = plt.subplots()
image = ax.imshow(whitened_cov, cmap='Blues')
fig.colorbar(image, label='Covariance of whitened data')
ax.set_xlabel('Dimension')
ax.set_ylabel('Dimension')
Out[46]:
In [46]:
In [46]:
In [ ]: