notebook.community

Edit and run



In [1]:

    
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# clean, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')



In [2]:

    
%matplotlib inline



In [3]:

    
import numpy as np
from sklearn.decomposition import PCA



In [4]:

    
# Toy data set: eight 2-D points, one (feature 0, feature 1) sample per row.
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
    [4, 3],
    [4, -1],
])
# Alternative data set to try instead:
# X = np.array([[-1, 1], [-2, 2], [-3, 3], [1, 1], [2, 2], [3, 3], [4, 4]])



In [5]:

    
# Display the toy data set (8 samples, 2 features).
X









    Out[5]:





array([[-1, -1],
       [-2, -1],
       [-3, -2],
       [ 1,  1],
       [ 2,  1],
       [ 3,  2],
       [ 4,  3],
       [ 4, -1]])



In [6]:

    
import matplotlib.pyplot as plt



In [7]:

    
# Scatter plot of the raw data: feature 0 on the x-axis, feature 1 on the y-axis.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X[:, 0], X[:, 1])
# fig.savefig('original.png')









    Out[7]:





<matplotlib.collections.PathCollection at 0x1a74bfce4a8>

Scale



In [8]:

    
from sklearn.preprocessing import StandardScaler



In [9]:

    
# StandardScaler with default settings (with_mean=True, with_std=True):
# each feature is centered and divided by its standard deviation.
scaler = StandardScaler()



In [10]:

    
# Learn the per-feature mean and variance from X; nothing is transformed yet.
scaler.fit(X)









    Out[10]:





StandardScaler(copy=True, with_mean=True, with_std=True)



In [15]:

    
# Per-feature means learned by fit(); these are subtracted when transforming.
scaler.mean_









    Out[15]:





array([1.  , 0.25])



In [16]:

    
# Per-feature variances learned by fit(). These are population variances
# (divided by n, not n - 1): 52 / 8 = 6.5 for the first feature.
scaler.var_









    Out[16]:





array([6.5   , 2.6875])



In [17]:

    
# Standardize X with the fitted scaler: (X - mean_) / sqrt(var_) per feature.
X_scaled = scaler.transform(X)



In [18]:

    
# Scatter plot of the standardized data (default figure size).
feature_0, feature_1 = X_scaled.T
plt.scatter(feature_0, feature_1)









    Out[18]:





<matplotlib.collections.PathCollection at 0x1a74c3e99e8>

Find linear correlations



In [15]:

    
import pandas as pd



In [16]:

    
# Wrap the standardized array in a DataFrame (integer column labels 0 and 1)
# so pandas can compute the correlation matrix.
df = pd.DataFrame(X_scaled)



In [17]:

    
# Pairwise Pearson correlation between the two features. Correlation is
# invariant to per-feature shifting and scaling, so standardizing does not change it.
corr_mat = df.corr()



In [18]:

    
# Display the 2x2 correlation matrix (off-diagonal value is about 0.718).
corr_mat



In [19]:

    
import seaborn as sns



In [20]:

    
# Annotated heatmap of the feature correlation matrix.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_mat, annot=True, ax=ax)
# fig.savefig('correlation.png')









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x2087012cd68>

The principal components are the eigenvectors of the data's covariance matrix; for standardized data these coincide with the eigenvectors of the correlation matrix



In [21]:

    
# NOTE(review): PCA is fit on the raw X, not on X_scaled from the scaling step.
# scikit-learn's PCA centers the data but does not rescale it, so the components
# are eigenvectors of X's covariance matrix, not of the correlation matrix.
# Confirm whether X_scaled was intended here (and in the later transform).
pca = PCA(n_components=2)
pca.fit(X)









    Out[21]:





PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)



In [22]:

    
# Variance captured along each principal component. The two values sum to
# 10.5, the total sample variance (n - 1 denominator) of the raw X.
pca.explained_variance_









    Out[22]:





array([9.3121762, 1.1878238])



In [23]:

    
# The ratios sum to 1. The first principal component explains about 89% of the
# total variance, so the second (about 11%) could be dropped with little loss.
pca.explained_variance_ratio_









    Out[23]:





array([0.88687392, 0.11312608])



In [24]:

    
# Project X onto the principal axes. With n_components=2 on 2-D data no
# dimension is dropped: the points are only centered and rotated.
X_reduced = pca.transform(X)



In [25]:

    
# Scatter plot of the data in principal-component coordinates
# (first component on the x-axis, second on the y-axis).
fig, ax = plt.subplots(figsize=(10, 10))
pc_1, pc_2 = X_reduced.T
ax.scatter(pc_1, pc_2)
# fig.savefig('reduced.png')









    Out[25]:





<matplotlib.collections.PathCollection at 0x2086fa26518>



In [ ]:

	0	1
0	1.000000	0.717778
1	0.717778	1.000000