In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
In [2]:
# Simulate y as a noisy linear function of x, then compare the OLS
# regression line with the direction of the first PCA component.
np.random.seed(1)
N = 500
x = np.random.randn(N)
y = x + 1 * np.random.randn(N)

plt.figure(figsize=(6,6))
plt.plot(x, y, 'k.')

# Ordinary least-squares fit of y on x (sklearn expects a 2-D feature matrix).
mdl = LinearRegression()
mdl.fit(x.reshape(-1, 1), y)
xs_mdl = [-5, 5]
ys_mdl = mdl.predict(np.reshape(xs_mdl, (-1, 1)))
plt.plot(xs_mdl, ys_mdl, 'k--', label='regression')

# First principal component of the (x, y) cloud; draw the line through the
# origin along that direction, spanning the plotted x-range [-5, 5].
pmdl = PCA()
pmdl.fit(np.column_stack([x, y]))
pca1x, pca1y = pmdl.components_[0]
# Scale factor that stretches the unit component vector to reach x = -5;
# the two endpoints are then mirror images through the origin.
x_scale = -xs_mdl[1] / pca1x
y_plt = [x_scale * pca1y, -x_scale * pca1y]
plt.plot(xs_mdl, y_plt, 'r--', label='pca')

# Reference identity line for comparison with both fits.
plt.plot(xs_mdl, xs_mdl, 'k', label='x=y')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend(loc='best')
plt.title('Data simulated y ~ x + error')
Out[2]:
In [3]:
# Simulate data as the sum/difference of two independent normal components
# (perpendicular directions (1,1) and (1,-1)), then compare the OLS
# regression line with the first PCA component direction.
np.random.seed(0)
N = 500
p1 = np.random.randn(N)        # component along (1, 1), sd = 1
p2 = .5 * np.random.randn(N)   # component along (1, -1), sd = 0.5
xp = p1 + p2
yp = p1 - p2

plt.figure(figsize=(6,6))
plt.plot(xp, yp, 'k.')

# Ordinary least-squares fit of yp on xp (expects a 2-D feature matrix).
mdl = LinearRegression()
mdl.fit(np.array([xp]).T, yp)
xps_mdl = [-5, 5]
yps_mdl = mdl.predict(np.array([xps_mdl]).T)
plt.plot(xps_mdl, yps_mdl, 'k--', label='regression')

# First principal component of the (xp, yp) cloud; the printed components
# show the recovered directions for comparison with the simulation above.
pmdl = PCA()
pmdl.fit(np.stack([xp,yp]).T)
print(pmdl.components_)
pca1xp, pca1yp = pmdl.components_[0]
# Scale the unit component vector so the line spans x in [-5, 5],
# mirrored through the origin.
xp_scale = -xps_mdl[1] / pca1xp
yp_plt = [xp_scale*pca1yp, -xp_scale*pca1yp]
plt.plot(xps_mdl, yp_plt, 'r--', label='pca')

# Reference identity line. BUG FIX: previously used xs_mdl, a variable
# defined in the PREVIOUS cell — a hidden-state dependency that raises
# NameError when this cell runs on a fresh kernel. Use this cell's xps_mdl.
plt.plot(xps_mdl, xps_mdl, 'k', label='x=y')
plt.xlim((-5,5))
plt.ylim((-5,5))
plt.legend(loc='best')
plt.title('Data simulated as 2 independent,\nperpendicular normal distributions')
Out[3]:
In [ ]: