In [1]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell is
# suppressed for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline

In [3]:
import numpy as np
from sklearn.decomposition import PCA

In [4]:
# Toy data set: eight two-dimensional integer samples, one row per sample.
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
    [4, 3],
    [4, -1],
])
# Alternative seven-sample data set, kept for experimentation:
# X = np.array([[-1, 1], [-2, 2], [-3, 3], [1, 1], [2, 2], [3, 3], [4, 4]])

In [5]:
# Echo the full sample matrix (8 rows x 2 columns) to confirm its contents.
X


Out[5]:
array([[-1, -1],
       [-2, -1],
       [-3, -2],
       [ 1,  1],
       [ 2,  1],
       [ 3,  2],
       [ 4,  3],
       [ 4, -1]])

In [6]:
import matplotlib.pyplot as plt

In [7]:
# Scatter plot of the raw samples: first column of X on the x-axis, second
# column on the y-axis. The title and axis labels let the figure stand alone.
plt.figure(figsize=(10, 10))
plt.title('Original data')
plt.xlabel('X[:, 0]')
plt.ylabel('X[:, 1]')
# scatter() stays the last expression so the cell still echoes the
# PathCollection handle.
plt.scatter(X[:, 0], X[:, 1])
# Uncomment to export the figure to disk.
# plt.savefig('original.png')


Out[7]:
<matplotlib.collections.PathCollection at 0x1a74bfce4a8>

Scale


In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Create an unfitted StandardScaler with its default settings.
scaler = StandardScaler()

In [10]:
# Learn the per-column mean and variance of X. fit() returns the scaler
# itself, which is why the cell echoes the estimator's repr.
scaler.fit(X)


Out[10]:
StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# Per-column mean of X learned by fit().
scaler.mean_


Out[15]:
array([1.  , 0.25])

In [16]:
# Per-column variance of X learned by fit(). The values match the population
# variance (sum of squared deviations divided by n, not n - 1).
scaler.var_


Out[16]:
array([6.5   , 2.6875])

In [17]:
# Standardize X with the fitted statistics: subtract each column's mean and
# divide by its standard deviation (with_mean and with_std are both enabled).
X_scaled = scaler.transform(X)

In [18]:
# Scatter plot of the standardized samples, drawn on a 10x10-inch figure to
# match the other plots in this notebook. The title and axis labels let the
# figure stand alone.
plt.figure(figsize=(10, 10))
plt.title('Standardized data')
plt.xlabel('X_scaled[:, 0]')
plt.ylabel('X_scaled[:, 1]')
# scatter() stays the last expression so the cell still echoes the
# PathCollection handle.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1])


Out[18]:
<matplotlib.collections.PathCollection at 0x1a74c3e99e8>

Find linear correlations


In [15]:
import pandas as pd

In [16]:
# Wrap the standardized samples in a DataFrame (default integer column
# labels 0 and 1) so pandas can compute the correlations.
df = pd.DataFrame(data=X_scaled)

In [17]:
# Pairwise Pearson correlation between the two columns (pandas' default
# method, spelled out here for clarity).
corr_mat = df.corr(method='pearson')

In [18]:
# Show the 2x2 correlation matrix: ones on the diagonal, the correlation
# between the two features off the diagonal.
corr_mat


Out[18]:
0 1
0 1.000000 0.717778
1 0.717778 1.000000

In [19]:
import seaborn as sns

In [20]:
# Annotated heatmap of the correlation matrix on a 10x10-inch canvas;
# annot=True prints each coefficient inside its cell.
plt.figure(figsize=(10, 10))
sns.heatmap(data=corr_mat, annot=True)
# Uncomment to export the figure to disk.
# plt.savefig('correlation.png')


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x2087012cd68>

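The principal components are the eigenvectors of the covariance matrix of the data. Note that PCA below is fit on the unscaled `X`, so it diagonalizes the covariance matrix; fitting it on `X_scaled` instead would make the components the eigenvectors of the correlation matrix shown above.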


In [21]:
# Keep both components (the data only has two features), so nothing is
# discarded at this point.
pca = PCA(n_components=2)
# NOTE(review): the model is fit on the raw X, not on X_scaled. PCA centers the
# data but does not rescale it, so the components come from the covariance
# matrix rather than the correlation matrix -- confirm this is intended.
# fit() returns the estimator, which is why the cell echoes its repr.
pca.fit(X)


Out[21]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [22]:
# Variance of the data along each principal component, largest first. The two
# values add up to the total sample variance of the unscaled X.
pca.explained_variance_


Out[22]:
array([9.3121762, 1.1878238])

In [23]:
# Fraction of the total variance captured by each component; the ratios sum
# to 1. The first component accounts for roughly 89% of the variance, so the
# second (about 11%) could be dropped with only a modest loss of information.
pca.explained_variance_ratio_


Out[23]:
array([0.88687392, 0.11312608])

In [24]:
# Project the samples onto the principal axes. Because n_components equals the
# number of input features (2), this re-expresses the data in the new basis
# without dropping a dimension, despite the "reduced" name.
X_reduced = pca.transform(X)

In [25]:
# Scatter plot of the samples in principal-component coordinates: first
# component on the x-axis, second on the y-axis. The title and axis labels let
# the figure stand alone.
plt.figure(figsize=(10, 10))
plt.title('Data projected onto the principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
# scatter() stays the last expression so the cell still echoes the
# PathCollection handle.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1])
# Uncomment to export the figure to disk.
# plt.savefig('reduced.png')


Out[25]:
<matplotlib.collections.PathCollection at 0x2086fa26518>

In [ ]: